{ "best_metric": 1.6814647912979126, "best_model_checkpoint": "/work/hdd/berb/mli23/models/llama3_inversion_t5-base_msmarco_len=128/checkpoint-1150000", "epoch": 68.43305843130373, "eval_steps": 25000, "global_step": 1170000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0058489793531028835, "grad_norm": 12.943063735961914, "learning_rate": 4e-05, "loss": 4.6555, "step": 100 }, { "epoch": 0.011697958706205767, "grad_norm": 3.1292779445648193, "learning_rate": 8e-05, "loss": 3.7364, "step": 200 }, { "epoch": 0.01754693805930865, "grad_norm": 1.0327661037445068, "learning_rate": 0.00012, "loss": 3.5109, "step": 300 }, { "epoch": 0.023395917412411534, "grad_norm": 0.9033965468406677, "learning_rate": 0.00016, "loss": 3.3908, "step": 400 }, { "epoch": 0.02924489676551442, "grad_norm": 1.1280786991119385, "learning_rate": 0.0002, "loss": 3.315, "step": 500 }, { "epoch": 0.0350938761186173, "grad_norm": 0.7215750217437744, "learning_rate": 0.00024, "loss": 3.2784, "step": 600 }, { "epoch": 0.040942855471720184, "grad_norm": 0.6628429889678955, "learning_rate": 0.00028000000000000003, "loss": 3.2406, "step": 700 }, { "epoch": 0.04679183482482307, "grad_norm": 0.7271576523780823, "learning_rate": 0.00032, "loss": 3.2257, "step": 800 }, { "epoch": 0.05264081417792595, "grad_norm": 0.8473170399665833, "learning_rate": 0.00035999999999999997, "loss": 3.1971, "step": 900 }, { "epoch": 0.05848979353102884, "grad_norm": 1.2454655170440674, "learning_rate": 0.0004, "loss": 3.2009, "step": 1000 }, { "epoch": 0.06433877288413171, "grad_norm": 0.8206815123558044, "learning_rate": 0.00044, "loss": 3.1683, "step": 1100 }, { "epoch": 0.0701877522372346, "grad_norm": 0.45900511741638184, "learning_rate": 0.00048, "loss": 3.1488, "step": 1200 }, { "epoch": 0.07603673159033748, "grad_norm": 0.612880527973175, "learning_rate": 0.0005200000000000001, "loss": 3.1296, "step": 1300 }, { "epoch": 0.08188571094344037, "grad_norm": 0.7772127985954285, "learning_rate": 0.0005600000000000001, "loss": 3.1207, "step": 1400 }, { "epoch": 0.08773469029654325, "grad_norm": 0.544497013092041, "learning_rate": 0.0006, "loss": 3.0966, "step": 1500 }, { "epoch": 0.09358366964964614, "grad_norm": 0.5593262314796448, "learning_rate": 0.00064, "loss": 3.0818, "step": 1600 }, { "epoch": 0.09943264900274902, "grad_norm": 0.42569369077682495, "learning_rate": 0.00068, "loss": 3.0717, "step": 1700 }, { "epoch": 0.1052816283558519, "grad_norm": 0.3750356137752533, "learning_rate": 0.0007199999999999999, "loss": 3.0816, "step": 1800 }, { "epoch": 0.11113060770895479, "grad_norm": 0.475235253572464, "learning_rate": 0.00076, "loss": 3.0608, "step": 1900 }, { "epoch": 0.11697958706205767, "grad_norm": 1.0628883838653564, "learning_rate": 0.0008, "loss": 3.0603, "step": 2000 }, { "epoch": 0.12282856641516056, "grad_norm": 0.44049274921417236, "learning_rate": 0.00084, "loss": 3.0459, "step": 2100 }, { "epoch": 0.12867754576826343, "grad_norm": 0.40404266119003296, "learning_rate": 0.00088, "loss": 3.0488, "step": 2200 }, { "epoch": 0.1345265251213663, "grad_norm": 0.9033539891242981, "learning_rate": 0.00092, "loss": 3.0332, "step": 2300 }, { "epoch": 0.1403755044744692, "grad_norm": 0.4601661264896393, "learning_rate": 0.00096, "loss": 3.0417, "step": 2400 }, { "epoch": 0.14622448382757208, "grad_norm": 0.9950599074363708, "learning_rate": 0.001, "loss": 3.0202, "step": 2500 }, { "epoch": 0.15207346318067497, "grad_norm": 0.9481270909309387, "learning_rate": 0.001, "loss": 3.0027, "step": 2600 }, { "epoch": 0.15792244253377785, "grad_norm": 0.304583877325058, "learning_rate": 0.001, "loss": 3.0025, "step": 2700 }, { "epoch": 0.16377142188688074, "grad_norm": 1.099980115890503, "learning_rate": 0.001, "loss": 2.9885, "step": 2800 }, { "epoch": 0.16962040123998362, "grad_norm": 4.135697364807129, "learning_rate": 0.001, "loss": 2.9906, "step": 2900 }, { "epoch": 0.1754693805930865, "grad_norm": 1.533362627029419, "learning_rate": 0.001, "loss": 2.9816, "step": 3000 }, { "epoch": 0.1813183599461894, "grad_norm": 0.5210053324699402, "learning_rate": 0.001, "loss": 2.9759, "step": 3100 }, { "epoch": 0.18716733929929227, "grad_norm": 0.5690032839775085, "learning_rate": 0.001, "loss": 2.9607, "step": 3200 }, { "epoch": 0.19301631865239516, "grad_norm": 0.2918831408023834, "learning_rate": 0.001, "loss": 2.9593, "step": 3300 }, { "epoch": 0.19886529800549804, "grad_norm": 0.277378648519516, "learning_rate": 0.001, "loss": 2.9456, "step": 3400 }, { "epoch": 0.20471427735860093, "grad_norm": 0.5827824473381042, "learning_rate": 0.001, "loss": 2.9309, "step": 3500 }, { "epoch": 0.2105632567117038, "grad_norm": 0.3150762915611267, "learning_rate": 0.001, "loss": 2.9296, "step": 3600 }, { "epoch": 0.2164122360648067, "grad_norm": 0.5635539293289185, "learning_rate": 0.001, "loss": 2.9152, "step": 3700 }, { "epoch": 0.22226121541790958, "grad_norm": 0.3011772036552429, "learning_rate": 0.001, "loss": 2.9122, "step": 3800 }, { "epoch": 0.22811019477101246, "grad_norm": 0.4245956838130951, "learning_rate": 0.001, "loss": 2.8949, "step": 3900 }, { "epoch": 0.23395917412411535, "grad_norm": 0.3619990050792694, "learning_rate": 0.001, "loss": 2.8932, "step": 4000 }, { "epoch": 0.23980815347721823, "grad_norm": 0.4192410707473755, "learning_rate": 0.001, "loss": 2.8909, "step": 4100 }, { "epoch": 0.24565713283032112, "grad_norm": 0.5433981418609619, "learning_rate": 0.001, "loss": 2.884, "step": 4200 }, { "epoch": 0.251506112183424, "grad_norm": 0.2839597463607788, "learning_rate": 0.001, "loss": 2.8704, "step": 4300 }, { "epoch": 0.25735509153652686, "grad_norm": 0.328612744808197, "learning_rate": 0.001, "loss": 2.8675, "step": 4400 }, { "epoch": 0.26320407088962977, "grad_norm": 0.30383020639419556, "learning_rate": 0.001, "loss": 2.8639, "step": 4500 }, { "epoch": 0.2690530502427326, "grad_norm": 0.3395404815673828, "learning_rate": 0.001, "loss": 2.8497, "step": 4600 }, { "epoch": 0.27490202959583554, "grad_norm": 0.45567628741264343, "learning_rate": 0.001, "loss": 2.8563, "step": 4700 }, { "epoch": 0.2807510089489384, "grad_norm": 0.34391820430755615, "learning_rate": 0.001, "loss": 2.8659, "step": 4800 }, { "epoch": 0.2865999883020413, "grad_norm": 0.3900957405567169, "learning_rate": 0.001, "loss": 2.8528, "step": 4900 }, { "epoch": 0.29244896765514417, "grad_norm": 0.29294323921203613, "learning_rate": 0.001, "loss": 2.8519, "step": 5000 }, { "epoch": 0.2982979470082471, "grad_norm": 0.559441864490509, "learning_rate": 0.001, "loss": 2.8493, "step": 5100 }, { "epoch": 0.30414692636134993, "grad_norm": 0.47313785552978516, "learning_rate": 0.001, "loss": 2.8415, "step": 5200 }, { "epoch": 0.30999590571445285, "grad_norm": 0.4527204632759094, "learning_rate": 0.001, "loss": 2.8246, "step": 5300 }, { "epoch": 0.3158448850675557, "grad_norm": 0.35146230459213257, "learning_rate": 0.001, "loss": 2.8307, "step": 5400 }, { "epoch": 0.3216938644206586, "grad_norm": 0.3346210718154907, "learning_rate": 0.001, "loss": 2.833, "step": 5500 }, { "epoch": 0.32754284377376147, "grad_norm": 0.25525936484336853, "learning_rate": 0.001, "loss": 2.824, "step": 5600 }, { "epoch": 0.3333918231268644, "grad_norm": 0.32972151041030884, "learning_rate": 0.001, "loss": 2.8086, "step": 5700 }, { "epoch": 0.33924080247996724, "grad_norm": 0.4162798225879669, "learning_rate": 0.001, "loss": 2.8045, "step": 5800 }, { "epoch": 0.34508978183307015, "grad_norm": 0.26571205258369446, "learning_rate": 0.001, "loss": 2.8113, "step": 5900 }, { "epoch": 0.350938761186173, "grad_norm": 0.37533682584762573, "learning_rate": 0.001, "loss": 2.7994, "step": 6000 }, { "epoch": 0.3567877405392759, "grad_norm": 0.2862384021282196, "learning_rate": 0.001, "loss": 2.7961, "step": 6100 }, { "epoch": 0.3626367198923788, "grad_norm": 0.47438132762908936, "learning_rate": 0.001, "loss": 2.7944, "step": 6200 }, { "epoch": 0.3684856992454817, "grad_norm": 0.557257890701294, "learning_rate": 0.001, "loss": 2.7927, "step": 6300 }, { "epoch": 0.37433467859858455, "grad_norm": 0.31082427501678467, "learning_rate": 0.001, "loss": 2.7977, "step": 6400 }, { "epoch": 0.3801836579516874, "grad_norm": 0.3456556797027588, "learning_rate": 0.001, "loss": 2.794, "step": 6500 }, { "epoch": 0.3860326373047903, "grad_norm": 0.3142741024494171, "learning_rate": 0.001, "loss": 2.8067, "step": 6600 }, { "epoch": 0.3918816166578932, "grad_norm": 0.6210492253303528, "learning_rate": 0.001, "loss": 2.7929, "step": 6700 }, { "epoch": 0.3977305960109961, "grad_norm": 0.24653126299381256, "learning_rate": 0.001, "loss": 2.7774, "step": 6800 }, { "epoch": 0.40357957536409894, "grad_norm": 0.3158399760723114, "learning_rate": 0.001, "loss": 2.7842, "step": 6900 }, { "epoch": 0.40942855471720185, "grad_norm": 0.5692638754844666, "learning_rate": 0.001, "loss": 2.7955, "step": 7000 }, { "epoch": 0.4152775340703047, "grad_norm": 0.2916674017906189, "learning_rate": 0.001, "loss": 2.7784, "step": 7100 }, { "epoch": 0.4211265134234076, "grad_norm": 0.30405744910240173, "learning_rate": 0.001, "loss": 2.7733, "step": 7200 }, { "epoch": 0.4269754927765105, "grad_norm": 0.49492916464805603, "learning_rate": 0.001, "loss": 2.7894, "step": 7300 }, { "epoch": 0.4328244721296134, "grad_norm": 0.36178985238075256, "learning_rate": 0.001, "loss": 2.7834, "step": 7400 }, { "epoch": 0.43867345148271625, "grad_norm": 0.3495817184448242, "learning_rate": 0.001, "loss": 2.7723, "step": 7500 }, { "epoch": 0.44452243083581916, "grad_norm": 0.32232311367988586, "learning_rate": 0.001, "loss": 2.7656, "step": 7600 }, { "epoch": 0.450371410188922, "grad_norm": 0.2932673990726471, "learning_rate": 0.001, "loss": 2.757, "step": 7700 }, { "epoch": 0.45622038954202493, "grad_norm": 0.7692651152610779, "learning_rate": 0.001, "loss": 2.7619, "step": 7800 }, { "epoch": 0.4620693688951278, "grad_norm": 0.3152422606945038, "learning_rate": 0.001, "loss": 2.7592, "step": 7900 }, { "epoch": 0.4679183482482307, "grad_norm": 0.42381253838539124, "learning_rate": 0.001, "loss": 2.7553, "step": 8000 }, { "epoch": 0.47376732760133355, "grad_norm": 0.2534554600715637, "learning_rate": 0.001, "loss": 2.762, "step": 8100 }, { "epoch": 0.47961630695443647, "grad_norm": 0.35539475083351135, "learning_rate": 0.001, "loss": 2.7459, "step": 8200 }, { "epoch": 0.4854652863075393, "grad_norm": 0.4595663845539093, "learning_rate": 0.001, "loss": 2.7367, "step": 8300 }, { "epoch": 0.49131426566064224, "grad_norm": 0.4989442825317383, "learning_rate": 0.001, "loss": 2.7409, "step": 8400 }, { "epoch": 0.4971632450137451, "grad_norm": 0.25956428050994873, "learning_rate": 0.001, "loss": 2.7416, "step": 8500 }, { "epoch": 0.503012224366848, "grad_norm": 0.2869381606578827, "learning_rate": 0.001, "loss": 2.7371, "step": 8600 }, { "epoch": 0.5088612037199509, "grad_norm": 0.3240751028060913, "learning_rate": 0.001, "loss": 2.7292, "step": 8700 }, { "epoch": 0.5147101830730537, "grad_norm": 0.28903892636299133, "learning_rate": 0.001, "loss": 2.7206, "step": 8800 }, { "epoch": 0.5205591624261566, "grad_norm": 0.3670004904270172, "learning_rate": 0.001, "loss": 2.728, "step": 8900 }, { "epoch": 0.5264081417792595, "grad_norm": 0.33026307821273804, "learning_rate": 0.001, "loss": 2.7169, "step": 9000 }, { "epoch": 0.5322571211323625, "grad_norm": 0.5844357013702393, "learning_rate": 0.001, "loss": 2.7304, "step": 9100 }, { "epoch": 0.5381061004854653, "grad_norm": 0.28374913334846497, "learning_rate": 0.001, "loss": 2.7222, "step": 9200 }, { "epoch": 0.5439550798385682, "grad_norm": 0.404506117105484, "learning_rate": 0.001, "loss": 2.7176, "step": 9300 }, { "epoch": 0.5498040591916711, "grad_norm": 0.6227047443389893, "learning_rate": 0.001, "loss": 2.7057, "step": 9400 }, { "epoch": 0.555653038544774, "grad_norm": 0.2927371561527252, "learning_rate": 0.001, "loss": 2.7093, "step": 9500 }, { "epoch": 0.5615020178978768, "grad_norm": 0.37304234504699707, "learning_rate": 0.001, "loss": 2.7127, "step": 9600 }, { "epoch": 0.5673509972509797, "grad_norm": 0.8582624197006226, "learning_rate": 0.001, "loss": 2.702, "step": 9700 }, { "epoch": 0.5731999766040826, "grad_norm": 0.3817168176174164, "learning_rate": 0.001, "loss": 2.6966, "step": 9800 }, { "epoch": 0.5790489559571854, "grad_norm": 0.2752266526222229, "learning_rate": 0.001, "loss": 2.6921, "step": 9900 }, { "epoch": 0.5848979353102883, "grad_norm": 0.3967178165912628, "learning_rate": 0.001, "loss": 2.6933, "step": 10000 }, { "epoch": 0.5907469146633912, "grad_norm": 0.41086024045944214, "learning_rate": 0.001, "loss": 2.7005, "step": 10100 }, { "epoch": 0.5965958940164942, "grad_norm": 0.47755008935928345, "learning_rate": 0.001, "loss": 2.6886, "step": 10200 }, { "epoch": 0.602444873369597, "grad_norm": 0.5245695114135742, "learning_rate": 0.001, "loss": 2.6924, "step": 10300 }, { "epoch": 0.6082938527226999, "grad_norm": 0.3606848418712616, "learning_rate": 0.001, "loss": 2.685, "step": 10400 }, { "epoch": 0.6141428320758028, "grad_norm": 0.2209954708814621, "learning_rate": 0.001, "loss": 2.681, "step": 10500 }, { "epoch": 0.6199918114289057, "grad_norm": 0.40113091468811035, "learning_rate": 0.001, "loss": 2.6839, "step": 10600 }, { "epoch": 0.6258407907820085, "grad_norm": 0.32870161533355713, "learning_rate": 0.001, "loss": 2.6796, "step": 10700 }, { "epoch": 0.6316897701351114, "grad_norm": 0.35721340775489807, "learning_rate": 0.001, "loss": 2.6754, "step": 10800 }, { "epoch": 0.6375387494882143, "grad_norm": 0.2920335829257965, "learning_rate": 0.001, "loss": 2.6679, "step": 10900 }, { "epoch": 0.6433877288413172, "grad_norm": 0.3933546543121338, "learning_rate": 0.001, "loss": 2.675, "step": 11000 }, { "epoch": 0.64923670819442, "grad_norm": 0.3200457990169525, "learning_rate": 0.001, "loss": 2.6679, "step": 11100 }, { "epoch": 0.6550856875475229, "grad_norm": 0.34713250398635864, "learning_rate": 0.001, "loss": 2.6776, "step": 11200 }, { "epoch": 0.6609346669006259, "grad_norm": 0.26418769359588623, "learning_rate": 0.001, "loss": 2.6714, "step": 11300 }, { "epoch": 0.6667836462537288, "grad_norm": 0.27883028984069824, "learning_rate": 0.001, "loss": 2.6654, "step": 11400 }, { "epoch": 0.6726326256068316, "grad_norm": 0.2562033534049988, "learning_rate": 0.001, "loss": 2.6572, "step": 11500 }, { "epoch": 0.6784816049599345, "grad_norm": 0.568248987197876, "learning_rate": 0.001, "loss": 2.6612, "step": 11600 }, { "epoch": 0.6843305843130374, "grad_norm": 0.3265298902988434, "learning_rate": 0.001, "loss": 2.6601, "step": 11700 }, { "epoch": 0.6901795636661403, "grad_norm": 0.41186779737472534, "learning_rate": 0.001, "loss": 2.6645, "step": 11800 }, { "epoch": 0.6960285430192431, "grad_norm": 0.2928140461444855, "learning_rate": 0.001, "loss": 2.6624, "step": 11900 }, { "epoch": 0.701877522372346, "grad_norm": 0.2873474955558777, "learning_rate": 0.001, "loss": 2.669, "step": 12000 }, { "epoch": 0.7077265017254489, "grad_norm": 0.36888572573661804, "learning_rate": 0.001, "loss": 2.6639, "step": 12100 }, { "epoch": 0.7135754810785518, "grad_norm": 0.3335033357143402, "learning_rate": 0.001, "loss": 2.6682, "step": 12200 }, { "epoch": 0.7194244604316546, "grad_norm": 0.4510112702846527, "learning_rate": 0.001, "loss": 2.6541, "step": 12300 }, { "epoch": 0.7252734397847576, "grad_norm": 0.3824751675128937, "learning_rate": 0.001, "loss": 2.6476, "step": 12400 }, { "epoch": 0.7311224191378605, "grad_norm": 0.30925267934799194, "learning_rate": 0.001, "loss": 2.6442, "step": 12500 }, { "epoch": 0.7369713984909634, "grad_norm": 0.25113895535469055, "learning_rate": 0.001, "loss": 2.6529, "step": 12600 }, { "epoch": 0.7428203778440662, "grad_norm": 0.5017949342727661, "learning_rate": 0.001, "loss": 2.6439, "step": 12700 }, { "epoch": 0.7486693571971691, "grad_norm": 0.42189210653305054, "learning_rate": 0.001, "loss": 2.6576, "step": 12800 }, { "epoch": 0.754518336550272, "grad_norm": 0.31621694564819336, "learning_rate": 0.001, "loss": 2.6365, "step": 12900 }, { "epoch": 0.7603673159033748, "grad_norm": 0.22871503233909607, "learning_rate": 0.001, "loss": 2.6386, "step": 13000 }, { "epoch": 0.7662162952564777, "grad_norm": 0.2661151885986328, "learning_rate": 0.001, "loss": 2.6363, "step": 13100 }, { "epoch": 0.7720652746095806, "grad_norm": 0.2954743504524231, "learning_rate": 0.001, "loss": 2.6286, "step": 13200 }, { "epoch": 0.7779142539626835, "grad_norm": 0.6017025113105774, "learning_rate": 0.001, "loss": 2.6259, "step": 13300 }, { "epoch": 0.7837632333157863, "grad_norm": 0.4094967246055603, "learning_rate": 0.001, "loss": 2.6294, "step": 13400 }, { "epoch": 0.7896122126688893, "grad_norm": 0.22659087181091309, "learning_rate": 0.001, "loss": 2.6261, "step": 13500 }, { "epoch": 0.7954611920219922, "grad_norm": 0.3093152344226837, "learning_rate": 0.001, "loss": 2.6267, "step": 13600 }, { "epoch": 0.8013101713750951, "grad_norm": 0.4502885937690735, "learning_rate": 0.001, "loss": 2.6187, "step": 13700 }, { "epoch": 0.8071591507281979, "grad_norm": 0.5576587319374084, "learning_rate": 0.001, "loss": 2.6069, "step": 13800 }, { "epoch": 0.8130081300813008, "grad_norm": 0.3503821790218353, "learning_rate": 0.001, "loss": 2.6163, "step": 13900 }, { "epoch": 0.8188571094344037, "grad_norm": 0.360505074262619, "learning_rate": 0.001, "loss": 2.6323, "step": 14000 }, { "epoch": 0.8247060887875066, "grad_norm": 0.36237767338752747, "learning_rate": 0.001, "loss": 2.6214, "step": 14100 }, { "epoch": 0.8305550681406094, "grad_norm": 0.5022086501121521, "learning_rate": 0.001, "loss": 2.6174, "step": 14200 }, { "epoch": 0.8364040474937123, "grad_norm": 0.4685794413089752, "learning_rate": 0.001, "loss": 2.6212, "step": 14300 }, { "epoch": 0.8422530268468152, "grad_norm": 0.2740049958229065, "learning_rate": 0.001, "loss": 2.6092, "step": 14400 }, { "epoch": 0.8481020061999182, "grad_norm": 0.35477593541145325, "learning_rate": 0.001, "loss": 2.615, "step": 14500 }, { "epoch": 0.853950985553021, "grad_norm": 0.3610607981681824, "learning_rate": 0.001, "loss": 2.5985, "step": 14600 }, { "epoch": 0.8597999649061239, "grad_norm": 0.2814117968082428, "learning_rate": 0.001, "loss": 2.6018, "step": 14700 }, { "epoch": 0.8656489442592268, "grad_norm": 0.5826596021652222, "learning_rate": 0.001, "loss": 2.609, "step": 14800 }, { "epoch": 0.8714979236123297, "grad_norm": 0.24935407936573029, "learning_rate": 0.001, "loss": 2.6125, "step": 14900 }, { "epoch": 0.8773469029654325, "grad_norm": 0.46704015135765076, "learning_rate": 0.001, "loss": 2.5899, "step": 15000 }, { "epoch": 0.8831958823185354, "grad_norm": 0.5216036438941956, "learning_rate": 0.001, "loss": 2.6002, "step": 15100 }, { "epoch": 0.8890448616716383, "grad_norm": 0.2542150914669037, "learning_rate": 0.001, "loss": 2.6043, "step": 15200 }, { "epoch": 0.8948938410247412, "grad_norm": 0.6096458435058594, "learning_rate": 0.001, "loss": 2.6029, "step": 15300 }, { "epoch": 0.900742820377844, "grad_norm": 0.49453744292259216, "learning_rate": 0.001, "loss": 2.5948, "step": 15400 }, { "epoch": 0.906591799730947, "grad_norm": 0.3526987135410309, "learning_rate": 0.001, "loss": 2.5956, "step": 15500 }, { "epoch": 0.9124407790840499, "grad_norm": 0.5171267986297607, "learning_rate": 0.001, "loss": 2.5904, "step": 15600 }, { "epoch": 0.9182897584371528, "grad_norm": 0.46608448028564453, "learning_rate": 0.001, "loss": 2.5864, "step": 15700 }, { "epoch": 0.9241387377902556, "grad_norm": 0.49829718470573425, "learning_rate": 0.001, "loss": 2.587, "step": 15800 }, { "epoch": 0.9299877171433585, "grad_norm": 0.4127972424030304, "learning_rate": 0.001, "loss": 2.5961, "step": 15900 }, { "epoch": 0.9358366964964614, "grad_norm": 0.3038919270038605, "learning_rate": 0.001, "loss": 2.586, "step": 16000 }, { "epoch": 0.9416856758495642, "grad_norm": 0.33385053277015686, "learning_rate": 0.001, "loss": 2.5853, "step": 16100 }, { "epoch": 0.9475346552026671, "grad_norm": 0.4260696470737457, "learning_rate": 0.001, "loss": 2.5774, "step": 16200 }, { "epoch": 0.95338363455577, "grad_norm": 0.3453454375267029, "learning_rate": 0.001, "loss": 2.576, "step": 16300 }, { "epoch": 0.9592326139088729, "grad_norm": 0.45317602157592773, "learning_rate": 0.001, "loss": 2.5697, "step": 16400 }, { "epoch": 0.9650815932619757, "grad_norm": 0.36661919951438904, "learning_rate": 0.001, "loss": 2.568, "step": 16500 }, { "epoch": 0.9709305726150786, "grad_norm": 0.3288170099258423, "learning_rate": 0.001, "loss": 2.5717, "step": 16600 }, { "epoch": 0.9767795519681816, "grad_norm": 0.24539104104042053, "learning_rate": 0.001, "loss": 2.5704, "step": 16700 }, { "epoch": 0.9826285313212845, "grad_norm": 0.37299594283103943, "learning_rate": 0.001, "loss": 2.5715, "step": 16800 }, { "epoch": 0.9884775106743873, "grad_norm": 0.3232550024986267, "learning_rate": 0.001, "loss": 2.5709, "step": 16900 }, { "epoch": 0.9943264900274902, "grad_norm": 0.34340640902519226, "learning_rate": 0.001, "loss": 2.5605, "step": 17000 }, { "epoch": 1.000175469380593, "grad_norm": 0.20594754815101624, "learning_rate": 0.001, "loss": 2.5617, "step": 17100 }, { "epoch": 1.006024448733696, "grad_norm": 0.20585927367210388, "learning_rate": 0.001, "loss": 2.5429, "step": 17200 }, { "epoch": 1.011873428086799, "grad_norm": 0.25559356808662415, "learning_rate": 0.001, "loss": 2.5386, "step": 17300 }, { "epoch": 1.0177224074399018, "grad_norm": 0.18721798062324524, "learning_rate": 0.001, "loss": 2.5354, "step": 17400 }, { "epoch": 1.0235713867930045, "grad_norm": 0.3241952061653137, "learning_rate": 0.001, "loss": 2.5377, "step": 17500 }, { "epoch": 1.0294203661461074, "grad_norm": 0.26688364148139954, "learning_rate": 0.001, "loss": 2.5344, "step": 17600 }, { "epoch": 1.0352693454992103, "grad_norm": 0.21780164539813995, "learning_rate": 0.001, "loss": 2.5367, "step": 17700 }, { "epoch": 1.0411183248523133, "grad_norm": 0.1688920259475708, "learning_rate": 0.001, "loss": 2.5348, "step": 17800 }, { "epoch": 1.0469673042054162, "grad_norm": 0.2468491941690445, "learning_rate": 0.001, "loss": 2.5289, "step": 17900 }, { "epoch": 1.052816283558519, "grad_norm": 0.2752334177494049, "learning_rate": 0.001, "loss": 2.5349, "step": 18000 }, { "epoch": 1.058665262911622, "grad_norm": 0.2685433030128479, "learning_rate": 0.001, "loss": 2.5278, "step": 18100 }, { "epoch": 1.064514242264725, "grad_norm": 0.3018209636211395, "learning_rate": 0.001, "loss": 2.5278, "step": 18200 }, { "epoch": 1.0703632216178276, "grad_norm": 0.22612260282039642, "learning_rate": 0.001, "loss": 2.5287, "step": 18300 }, { "epoch": 1.0762122009709305, "grad_norm": 0.19808165729045868, "learning_rate": 0.001, "loss": 2.5247, "step": 18400 }, { "epoch": 1.0820611803240334, "grad_norm": 0.2071445733308792, "learning_rate": 0.001, "loss": 2.5244, "step": 18500 }, { "epoch": 1.0879101596771363, "grad_norm": 0.2557687759399414, "learning_rate": 0.001, "loss": 2.5203, "step": 18600 }, { "epoch": 1.0937591390302392, "grad_norm": 0.1922091841697693, "learning_rate": 0.001, "loss": 2.5248, "step": 18700 }, { "epoch": 1.0996081183833422, "grad_norm": 0.28338736295700073, "learning_rate": 0.001, "loss": 2.5181, "step": 18800 }, { "epoch": 1.105457097736445, "grad_norm": 0.2635754644870758, "learning_rate": 0.001, "loss": 2.5184, "step": 18900 }, { "epoch": 1.111306077089548, "grad_norm": 0.22370187938213348, "learning_rate": 0.001, "loss": 2.5183, "step": 19000 }, { "epoch": 1.1171550564426507, "grad_norm": 0.243075430393219, "learning_rate": 0.001, "loss": 2.5166, "step": 19100 }, { "epoch": 1.1230040357957536, "grad_norm": 0.21709586679935455, "learning_rate": 0.001, "loss": 2.5157, "step": 19200 }, { "epoch": 1.1288530151488565, "grad_norm": 0.19943048059940338, "learning_rate": 0.001, "loss": 2.5143, "step": 19300 }, { "epoch": 1.1347019945019594, "grad_norm": 0.25194117426872253, "learning_rate": 0.001, "loss": 2.512, "step": 19400 }, { "epoch": 1.1405509738550623, "grad_norm": 0.2807503342628479, "learning_rate": 0.001, "loss": 2.5129, "step": 19500 }, { "epoch": 1.1463999532081652, "grad_norm": 0.18253426253795624, "learning_rate": 0.001, "loss": 2.5042, "step": 19600 }, { "epoch": 1.1522489325612681, "grad_norm": 0.2365122139453888, "learning_rate": 0.001, "loss": 2.507, "step": 19700 }, { "epoch": 1.1580979119143708, "grad_norm": 0.33415156602859497, "learning_rate": 0.001, "loss": 2.5059, "step": 19800 }, { "epoch": 1.1639468912674737, "grad_norm": 0.21700790524482727, "learning_rate": 0.001, "loss": 2.5153, "step": 19900 }, { "epoch": 1.1697958706205767, "grad_norm": 0.24956712126731873, "learning_rate": 0.001, "loss": 2.5154, "step": 20000 }, { "epoch": 1.1756448499736796, "grad_norm": 0.24380935728549957, "learning_rate": 0.001, "loss": 2.508, "step": 20100 }, { "epoch": 1.1814938293267825, "grad_norm": 0.18379876017570496, "learning_rate": 0.001, "loss": 2.5096, "step": 20200 }, { "epoch": 1.1873428086798854, "grad_norm": 0.2209005206823349, "learning_rate": 0.001, "loss": 2.5053, "step": 20300 }, { "epoch": 1.1931917880329883, "grad_norm": 0.16225199401378632, "learning_rate": 0.001, "loss": 2.5041, "step": 20400 }, { "epoch": 1.1990407673860912, "grad_norm": 0.2412300556898117, "learning_rate": 0.001, "loss": 2.4954, "step": 20500 }, { "epoch": 1.2048897467391941, "grad_norm": 0.16954025626182556, "learning_rate": 0.001, "loss": 2.496, "step": 20600 }, { "epoch": 1.2107387260922968, "grad_norm": 0.2393045425415039, "learning_rate": 0.001, "loss": 2.4922, "step": 20700 }, { "epoch": 1.2165877054453997, "grad_norm": 0.22956933081150055, "learning_rate": 0.001, "loss": 2.498, "step": 20800 }, { "epoch": 1.2224366847985026, "grad_norm": 0.19904771447181702, "learning_rate": 0.001, "loss": 2.4949, "step": 20900 }, { "epoch": 1.2282856641516056, "grad_norm": 0.16651825606822968, "learning_rate": 0.001, "loss": 2.4853, "step": 21000 }, { "epoch": 1.2341346435047085, "grad_norm": 0.21016888320446014, "learning_rate": 0.001, "loss": 2.4921, "step": 21100 }, { "epoch": 1.2399836228578114, "grad_norm": 0.23262888193130493, "learning_rate": 0.001, "loss": 2.4879, "step": 21200 }, { "epoch": 1.245832602210914, "grad_norm": 0.30768677592277527, "learning_rate": 0.001, "loss": 2.4853, "step": 21300 }, { "epoch": 1.251681581564017, "grad_norm": 0.2619354724884033, "learning_rate": 0.001, "loss": 2.4794, "step": 21400 }, { "epoch": 1.25753056091712, "grad_norm": 0.19283078610897064, "learning_rate": 0.001, "loss": 2.4845, "step": 21500 }, { "epoch": 1.2633795402702228, "grad_norm": 0.20024755597114563, "learning_rate": 0.001, "loss": 2.4792, "step": 21600 }, { "epoch": 1.2692285196233257, "grad_norm": 0.2168792337179184, "learning_rate": 0.001, "loss": 2.4938, "step": 21700 }, { "epoch": 1.2750774989764286, "grad_norm": 0.2448788583278656, "learning_rate": 0.001, "loss": 2.4819, "step": 21800 }, { "epoch": 1.2809264783295315, "grad_norm": 0.2325625866651535, "learning_rate": 0.001, "loss": 2.4963, "step": 21900 }, { "epoch": 1.2867754576826345, "grad_norm": 0.2712757885456085, "learning_rate": 0.001, "loss": 2.482, "step": 22000 }, { "epoch": 1.2926244370357374, "grad_norm": 0.2578014135360718, "learning_rate": 0.001, "loss": 2.4954, "step": 22100 }, { "epoch": 1.2984734163888403, "grad_norm": 0.24209511280059814, "learning_rate": 0.001, "loss": 2.4838, "step": 22200 }, { "epoch": 1.304322395741943, "grad_norm": 0.2685130834579468, "learning_rate": 0.001, "loss": 2.4759, "step": 22300 }, { "epoch": 1.3101713750950459, "grad_norm": 0.24375727772712708, "learning_rate": 0.001, "loss": 2.4765, "step": 22400 }, { "epoch": 1.3160203544481488, "grad_norm": 0.19386479258537292, "learning_rate": 0.001, "loss": 2.4789, "step": 22500 }, { "epoch": 1.3218693338012517, "grad_norm": 0.21841387450695038, "learning_rate": 0.001, "loss": 2.4732, "step": 22600 }, { "epoch": 1.3277183131543546, "grad_norm": 0.21536213159561157, "learning_rate": 0.001, "loss": 2.4711, "step": 22700 }, { "epoch": 1.3335672925074573, "grad_norm": 0.18462149798870087, "learning_rate": 0.001, "loss": 2.4664, "step": 22800 }, { "epoch": 1.3394162718605602, "grad_norm": 0.2527564465999603, "learning_rate": 0.001, "loss": 2.4647, "step": 22900 }, { "epoch": 1.3452652512136631, "grad_norm": 0.20399995148181915, "learning_rate": 0.001, "loss": 2.4617, "step": 23000 }, { "epoch": 1.351114230566766, "grad_norm": 0.34047460556030273, "learning_rate": 0.001, "loss": 2.4656, "step": 23100 }, { "epoch": 1.356963209919869, "grad_norm": 0.2932854890823364, "learning_rate": 0.001, "loss": 2.4583, "step": 23200 }, { "epoch": 1.3628121892729719, "grad_norm": 0.2324647158384323, "learning_rate": 0.001, "loss": 2.4551, "step": 23300 }, { "epoch": 1.3686611686260748, "grad_norm": 0.19455721974372864, "learning_rate": 0.001, "loss": 2.4638, "step": 23400 }, { "epoch": 1.3745101479791777, "grad_norm": 0.23765920102596283, "learning_rate": 0.001, "loss": 2.4542, "step": 23500 }, { "epoch": 1.3803591273322806, "grad_norm": 0.24041135609149933, "learning_rate": 0.001, "loss": 2.457, "step": 23600 }, { "epoch": 1.3862081066853835, "grad_norm": 0.20191903412342072, "learning_rate": 0.001, "loss": 2.4555, "step": 23700 }, { "epoch": 1.3920570860384862, "grad_norm": 0.2119702845811844, "learning_rate": 0.001, "loss": 2.4538, "step": 23800 }, { "epoch": 1.3979060653915891, "grad_norm": 0.22643940150737762, "learning_rate": 0.001, "loss": 2.4543, "step": 23900 }, { "epoch": 1.403755044744692, "grad_norm": 0.21883685886859894, "learning_rate": 0.001, "loss": 2.4483, "step": 24000 }, { "epoch": 1.409604024097795, "grad_norm": 0.24098359048366547, "learning_rate": 0.001, "loss": 2.4535, "step": 24100 }, { "epoch": 1.4154530034508979, "grad_norm": 0.2588041126728058, "learning_rate": 0.001, "loss": 2.4536, "step": 24200 }, { "epoch": 1.4213019828040008, "grad_norm": 0.16840530931949615, "learning_rate": 0.001, "loss": 2.4545, "step": 24300 }, { "epoch": 1.4271509621571035, "grad_norm": 0.19587203860282898, "learning_rate": 0.001, "loss": 2.4569, "step": 24400 }, { "epoch": 1.4329999415102064, "grad_norm": 0.24545466899871826, "learning_rate": 0.001, "loss": 2.4451, "step": 24500 }, { "epoch": 1.4388489208633093, "grad_norm": 0.2628709375858307, "learning_rate": 0.001, "loss": 2.447, "step": 24600 }, { "epoch": 1.4446979002164122, "grad_norm": 0.21540479362010956, "learning_rate": 0.001, "loss": 2.4464, "step": 24700 }, { "epoch": 1.4505468795695151, "grad_norm": 0.21766430139541626, "learning_rate": 0.001, "loss": 2.4425, "step": 24800 }, { "epoch": 1.456395858922618, "grad_norm": 0.21005024015903473, "learning_rate": 0.001, "loss": 2.443, "step": 24900 }, { "epoch": 1.462244838275721, "grad_norm": 0.22057472169399261, "learning_rate": 0.001, "loss": 2.4383, "step": 25000 }, { "epoch": 1.462244838275721, "eval_ag_news_accuracy": 0.21165625, "eval_ag_news_bleu_score": 3.534427252554638, "eval_ag_news_bleu_score_sem": 0.2644906233469291, "eval_ag_news_emb_cos_sim": 0.5801689028739929, "eval_ag_news_emb_cos_sim_sem": 0.01702900417149067, "eval_ag_news_emb_top1_equal": 0.90625, "eval_ag_news_emb_top1_equal_sem": 0.025864720344543457, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 3.1293857097625732, "eval_ag_news_n_ngrams_match_1": 10.0078125, "eval_ag_news_n_ngrams_match_2": 1.796875, "eval_ag_news_n_ngrams_match_3": 0.4765625, "eval_ag_news_num_pred_words": 48.796875, "eval_ag_news_num_true_words": 44.875, "eval_ag_news_perplexity": 22.859932595009848, "eval_ag_news_pred_num_tokens": 78.6875, "eval_ag_news_rouge_score": 0.20277509797065635, "eval_ag_news_runtime": 37.2319, "eval_ag_news_samples_per_second": 13.429, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.24310378881789194, "eval_ag_news_token_set_f1_sem": 0.00892504227228375, "eval_ag_news_token_set_precision": 0.2167832498877239, "eval_ag_news_token_set_recall": 0.2942497821204974, "eval_ag_news_true_num_tokens": 62.6640625, "step": 25000 }, { "epoch": 1.462244838275721, "eval_anthropic_toxic_prompts_accuracy": 0.08565625, "eval_anthropic_toxic_prompts_bleu_score": 18.887297319205643, "eval_anthropic_toxic_prompts_bleu_score_sem": 1.4707935666978205, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.7681809663772583, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.012376615777611732, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.0078125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0078125, "eval_anthropic_toxic_prompts_loss": 1.723764419555664, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 6.890625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 2.8828125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 1.4140625, "eval_anthropic_toxic_prompts_num_pred_words": 16.015625, "eval_anthropic_toxic_prompts_num_true_words": 15.6640625, "eval_anthropic_toxic_prompts_perplexity": 5.605590590904464, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.7890625, "eval_anthropic_toxic_prompts_rouge_score": 0.47356545167148234, "eval_anthropic_toxic_prompts_runtime": 30.1892, "eval_anthropic_toxic_prompts_samples_per_second": 16.562, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.5086950659027967, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01692214654434852, "eval_anthropic_toxic_prompts_token_set_precision": 0.49032902240230625, "eval_anthropic_toxic_prompts_token_set_recall": 0.5427313329529544, "eval_anthropic_toxic_prompts_true_num_tokens": 18.9453125, "step": 25000 }, { "epoch": 1.462244838275721, "eval_arxiv_accuracy": 0.35, "eval_arxiv_bleu_score": 0.8860690052492396, "eval_arxiv_bleu_score_sem": 0.0988396138130459, "eval_arxiv_emb_cos_sim": 0.26209062337875366, "eval_arxiv_emb_cos_sim_sem": 0.014273686334490776, "eval_arxiv_emb_top1_equal": 0.6640625, "eval_arxiv_emb_top1_equal_sem": 0.04191137105226517, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.656667709350586, "eval_arxiv_n_ngrams_match_1": 8.1171875, "eval_arxiv_n_ngrams_match_2": 0.9375, "eval_arxiv_n_ngrams_match_3": 0.1015625, "eval_arxiv_num_pred_words": 48.2421875, "eval_arxiv_num_true_words": 85.9140625, "eval_arxiv_perplexity": 38.7320611037018, "eval_arxiv_pred_num_tokens": 125.3984375, "eval_arxiv_rouge_score": 0.11036677223613531, "eval_arxiv_runtime": 30.9622, "eval_arxiv_samples_per_second": 16.149, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.10541477804392752, "eval_arxiv_token_set_f1_sem": 0.007379828117335166, "eval_arxiv_token_set_precision": 0.06670887006127653, "eval_arxiv_token_set_recall": 0.354045581347542, "eval_arxiv_true_num_tokens": 125.2421875, "step": 25000 }, { "epoch": 1.462244838275721, "eval_python_code_alpaca_accuracy": 0.109015625, "eval_python_code_alpaca_bleu_score": 11.287982296178086, "eval_python_code_alpaca_bleu_score_sem": 0.8190556329124217, "eval_python_code_alpaca_emb_cos_sim": 0.7332732081413269, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011542395688593388, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 2.019791603088379, "eval_python_code_alpaca_n_ngrams_match_1": 7.6328125, "eval_python_code_alpaca_n_ngrams_match_2": 2.515625, "eval_python_code_alpaca_n_ngrams_match_3": 0.921875, "eval_python_code_alpaca_num_pred_words": 19.7734375, "eval_python_code_alpaca_num_true_words": 19.8203125, "eval_python_code_alpaca_perplexity": 7.536754133707565, "eval_python_code_alpaca_pred_num_tokens": 26.1796875, "eval_python_code_alpaca_rouge_score": 0.3947081904759825, "eval_python_code_alpaca_runtime": 29.915, "eval_python_code_alpaca_samples_per_second": 16.714, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.42478266085734845, "eval_python_code_alpaca_token_set_f1_sem": 0.01261734235459831, "eval_python_code_alpaca_token_set_precision": 0.4184851916966724, "eval_python_code_alpaca_token_set_recall": 0.4424055991043919, "eval_python_code_alpaca_true_num_tokens": 25.1171875, "step": 25000 }, { "epoch": 1.462244838275721, "eval_wikibio_accuracy": 0.340078125, "eval_wikibio_bleu_score": 3.512822151826979, "eval_wikibio_bleu_score_sem": 0.40629797178092536, "eval_wikibio_emb_cos_sim": 0.40614253282546997, "eval_wikibio_emb_cos_sim_sem": 0.02155975252389908, "eval_wikibio_emb_top1_equal": 0.6875, "eval_wikibio_emb_top1_equal_sem": 0.04113007336854935, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.957691192626953, "eval_wikibio_n_ngrams_match_1": 11.453125, "eval_wikibio_n_ngrams_match_2": 2.8671875, "eval_wikibio_n_ngrams_match_3": 0.875, "eval_wikibio_num_pred_words": 51.1171875, "eval_wikibio_num_true_words": 52.7578125, "eval_wikibio_perplexity": 19.253467851269257, "eval_wikibio_pred_num_tokens": 109.2109375, "eval_wikibio_rouge_score": 0.1935694544425893, "eval_wikibio_runtime": 29.9972, "eval_wikibio_samples_per_second": 16.668, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.21231180189619753, "eval_wikibio_token_set_f1_sem": 0.013504009726642315, "eval_wikibio_token_set_precision": 0.17731241247264234, "eval_wikibio_token_set_recall": 0.2919202947330419, "eval_wikibio_true_num_tokens": 101.6171875, "step": 25000 }, { "epoch": 1.462244838275721, "eval_msmarco_accuracy": 0.336140625, "eval_msmarco_bleu_score": 8.364193537436286, "eval_msmarco_bleu_score_sem": 1.1674156371442077, "eval_msmarco_emb_cos_sim": 0.6448519229888916, "eval_msmarco_emb_cos_sim_sem": 0.018446683883666992, "eval_msmarco_emb_top1_equal": 0.90625, "eval_msmarco_emb_top1_equal_sem": 0.025864720344543457, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 2.2558319568634033, "eval_msmarco_n_ngrams_match_1": 19.703125, "eval_msmarco_n_ngrams_match_2": 6.6171875, "eval_msmarco_n_ngrams_match_3": 3.1953125, "eval_msmarco_num_pred_words": 58.7734375, "eval_msmarco_num_true_words": 59.53125, "eval_msmarco_perplexity": 9.543229563713254, "eval_msmarco_pred_num_tokens": 83.890625, "eval_msmarco_rouge_score": 0.30456460267417534, "eval_msmarco_runtime": 25.1307, "eval_msmarco_samples_per_second": 19.896, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.34679900540712416, "eval_msmarco_token_set_f1_sem": 0.014196388090400448, "eval_msmarco_token_set_precision": 0.2979990213974149, "eval_msmarco_token_set_recall": 0.4459538956722221, "eval_msmarco_true_num_tokens": 76.21875, "step": 25000 }, { "epoch": 1.4680938176288238, "grad_norm": 0.28528645634651184, "learning_rate": 0.001, "loss": 2.4451, "step": 25100 }, { "epoch": 1.4739427969819268, "grad_norm": 0.2357073575258255, "learning_rate": 0.001, "loss": 2.4427, "step": 25200 }, { "epoch": 1.4797917763350297, "grad_norm": 0.2571132183074951, "learning_rate": 0.001, "loss": 2.4344, "step": 25300 }, { "epoch": 1.4856407556881324, "grad_norm": 0.21259337663650513, "learning_rate": 0.001, "loss": 2.4315, "step": 25400 }, { "epoch": 1.4914897350412353, "grad_norm": 0.2394593209028244, "learning_rate": 0.001, "loss": 2.4322, "step": 25500 }, { "epoch": 1.4973387143943382, "grad_norm": 0.24678024649620056, "learning_rate": 0.001, "loss": 2.4309, "step": 25600 }, { "epoch": 1.503187693747441, "grad_norm": 0.26587486267089844, "learning_rate": 0.001, "loss": 2.4364, "step": 25700 }, { "epoch": 1.5090366731005438, "grad_norm": 0.1972772628068924, "learning_rate": 0.001, "loss": 2.431, "step": 25800 }, { "epoch": 1.5148856524536467, "grad_norm": 0.17835430800914764, "learning_rate": 0.001, "loss": 2.4305, "step": 25900 }, { "epoch": 1.5207346318067496, "grad_norm": 0.19541141390800476, "learning_rate": 0.001, "loss": 2.427, "step": 26000 }, { "epoch": 1.5265836111598525, "grad_norm": 0.2052622139453888, "learning_rate": 0.001, "loss": 2.4414, "step": 26100 }, { "epoch": 1.5324325905129554, "grad_norm": 0.226291224360466, "learning_rate": 0.001, "loss": 2.4274, "step": 26200 }, { "epoch": 1.5382815698660584, "grad_norm": 0.2687996029853821, "learning_rate": 0.001, "loss": 2.4332, "step": 26300 }, { "epoch": 1.5441305492191613, "grad_norm": 0.24806822836399078, "learning_rate": 0.001, "loss": 2.4302, "step": 26400 }, { "epoch": 1.5499795285722642, "grad_norm": 0.19609582424163818, "learning_rate": 0.001, "loss": 2.4327, "step": 26500 }, { "epoch": 1.555828507925367, "grad_norm": 0.24692796170711517, "learning_rate": 0.001, "loss": 2.4273, "step": 26600 }, { "epoch": 1.56167748727847, "grad_norm": 0.22361747920513153, "learning_rate": 0.001, "loss": 2.4223, "step": 26700 }, { "epoch": 1.567526466631573, "grad_norm": 0.22509080171585083, "learning_rate": 0.001, "loss": 2.4277, "step": 26800 }, { "epoch": 1.5733754459846758, "grad_norm": 0.18645142018795013, "learning_rate": 0.001, "loss": 2.4215, "step": 26900 }, { "epoch": 1.5792244253377785, "grad_norm": 0.25416824221611023, "learning_rate": 0.001, "loss": 2.4291, "step": 27000 }, { "epoch": 1.5850734046908814, "grad_norm": 0.20248450338840485, "learning_rate": 0.001, "loss": 2.428, "step": 27100 }, { "epoch": 1.5909223840439843, "grad_norm": 0.17877057194709778, "learning_rate": 0.001, "loss": 2.4232, "step": 27200 }, { "epoch": 1.5967713633970873, "grad_norm": 0.2392747700214386, "learning_rate": 0.001, "loss": 2.4222, "step": 27300 }, { "epoch": 1.60262034275019, "grad_norm": 0.23741014301776886, "learning_rate": 0.001, "loss": 2.4217, "step": 27400 }, { "epoch": 1.6084693221032929, "grad_norm": 0.2881547510623932, "learning_rate": 0.001, "loss": 2.4226, "step": 27500 }, { "epoch": 1.6143183014563958, "grad_norm": 0.26170873641967773, "learning_rate": 0.001, "loss": 2.421, "step": 27600 }, { "epoch": 1.6201672808094987, "grad_norm": 0.20851947367191315, "learning_rate": 0.001, "loss": 2.4147, "step": 27700 }, { "epoch": 1.6260162601626016, "grad_norm": 0.31872814893722534, "learning_rate": 0.001, "loss": 2.4149, "step": 27800 }, { "epoch": 1.6318652395157045, "grad_norm": 0.39523184299468994, "learning_rate": 0.001, "loss": 2.4234, "step": 27900 }, { "epoch": 1.6377142188688074, "grad_norm": 0.2895563542842865, "learning_rate": 0.001, "loss": 2.4182, "step": 28000 }, { "epoch": 1.6435631982219103, "grad_norm": 0.2978422939777374, "learning_rate": 0.001, "loss": 2.4127, "step": 28100 }, { "epoch": 1.6494121775750132, "grad_norm": 0.22559039294719696, "learning_rate": 0.001, "loss": 2.4117, "step": 28200 }, { "epoch": 1.6552611569281162, "grad_norm": 0.2733854651451111, "learning_rate": 0.001, "loss": 2.4151, "step": 28300 }, { "epoch": 1.661110136281219, "grad_norm": 0.27238959074020386, "learning_rate": 0.001, "loss": 2.4063, "step": 28400 }, { "epoch": 1.666959115634322, "grad_norm": 0.29666972160339355, "learning_rate": 0.001, "loss": 2.4103, "step": 28500 }, { "epoch": 1.6728080949874247, "grad_norm": 0.26020702719688416, "learning_rate": 0.001, "loss": 2.4039, "step": 28600 }, { "epoch": 1.6786570743405276, "grad_norm": 0.2111108899116516, "learning_rate": 0.001, "loss": 2.3994, "step": 28700 }, { "epoch": 1.6845060536936305, "grad_norm": 0.25818005204200745, "learning_rate": 0.001, "loss": 2.4009, "step": 28800 }, { "epoch": 1.6903550330467332, "grad_norm": 0.2186482697725296, "learning_rate": 0.001, "loss": 2.4081, "step": 28900 }, { "epoch": 1.696204012399836, "grad_norm": 0.32617518305778503, "learning_rate": 0.001, "loss": 2.4038, "step": 29000 }, { "epoch": 1.702052991752939, "grad_norm": 0.20414134860038757, "learning_rate": 0.001, "loss": 2.4034, "step": 29100 }, { "epoch": 1.707901971106042, "grad_norm": 0.20291084051132202, "learning_rate": 0.001, "loss": 2.3975, "step": 29200 }, { "epoch": 1.7137509504591448, "grad_norm": 0.21581241488456726, "learning_rate": 0.001, "loss": 2.4004, "step": 29300 }, { "epoch": 1.7195999298122477, "grad_norm": 0.18834257125854492, "learning_rate": 0.001, "loss": 2.3971, "step": 29400 }, { "epoch": 1.7254489091653507, "grad_norm": 0.2372192144393921, "learning_rate": 0.001, "loss": 2.4003, "step": 29500 }, { "epoch": 1.7312978885184536, "grad_norm": 0.34013861417770386, "learning_rate": 0.001, "loss": 2.3926, "step": 29600 }, { "epoch": 1.7371468678715565, "grad_norm": 0.24962738156318665, "learning_rate": 0.001, "loss": 2.3931, "step": 29700 }, { "epoch": 1.7429958472246594, "grad_norm": 0.22313758730888367, "learning_rate": 0.001, "loss": 2.3956, "step": 29800 }, { "epoch": 1.7488448265777623, "grad_norm": 0.2607101500034332, "learning_rate": 0.001, "loss": 2.3964, "step": 29900 }, { "epoch": 1.7546938059308652, "grad_norm": 0.25132322311401367, "learning_rate": 0.001, "loss": 2.3964, "step": 30000 }, { "epoch": 1.760542785283968, "grad_norm": 0.31079235672950745, "learning_rate": 0.001, "loss": 2.3956, "step": 30100 }, { "epoch": 1.7663917646370708, "grad_norm": 0.19893445074558258, "learning_rate": 0.001, "loss": 2.3903, "step": 30200 }, { "epoch": 1.7722407439901737, "grad_norm": 0.172132670879364, "learning_rate": 0.001, "loss": 2.3861, "step": 30300 }, { "epoch": 1.7780897233432766, "grad_norm": 0.274661660194397, "learning_rate": 0.001, "loss": 2.3909, "step": 30400 }, { "epoch": 1.7839387026963793, "grad_norm": 0.2610439956188202, "learning_rate": 0.001, "loss": 2.3928, "step": 30500 }, { "epoch": 1.7897876820494822, "grad_norm": 0.27652570605278015, "learning_rate": 0.001, "loss": 2.3876, "step": 30600 }, { "epoch": 1.7956366614025852, "grad_norm": 0.2831563651561737, "learning_rate": 0.001, "loss": 2.3922, "step": 30700 }, { "epoch": 1.801485640755688, "grad_norm": 0.2459115982055664, "learning_rate": 0.001, "loss": 2.3828, "step": 30800 }, { "epoch": 1.807334620108791, "grad_norm": 0.2187494933605194, "learning_rate": 0.001, "loss": 2.3893, "step": 30900 }, { "epoch": 1.813183599461894, "grad_norm": 0.24702590703964233, "learning_rate": 0.001, "loss": 2.3811, "step": 31000 }, { "epoch": 1.8190325788149968, "grad_norm": 0.21576926112174988, "learning_rate": 0.001, "loss": 2.3825, "step": 31100 }, { "epoch": 1.8248815581680997, "grad_norm": 0.24321870505809784, "learning_rate": 0.001, "loss": 2.3859, "step": 31200 }, { "epoch": 1.8307305375212026, "grad_norm": 0.20851227641105652, "learning_rate": 0.001, "loss": 2.3812, "step": 31300 }, { "epoch": 1.8365795168743055, "grad_norm": 0.2823316156864166, "learning_rate": 0.001, "loss": 2.3788, "step": 31400 }, { "epoch": 1.8424284962274085, "grad_norm": 0.2678524851799011, "learning_rate": 0.001, "loss": 2.3815, "step": 31500 }, { "epoch": 1.8482774755805114, "grad_norm": 0.2754901945590973, "learning_rate": 0.001, "loss": 2.3692, "step": 31600 }, { "epoch": 1.854126454933614, "grad_norm": 0.28908759355545044, "learning_rate": 0.001, "loss": 2.3777, "step": 31700 }, { "epoch": 1.859975434286717, "grad_norm": 0.3080601990222931, "learning_rate": 0.001, "loss": 2.3766, "step": 31800 }, { "epoch": 1.8658244136398199, "grad_norm": 0.2102145552635193, "learning_rate": 0.001, "loss": 2.3767, "step": 31900 }, { "epoch": 1.8716733929929226, "grad_norm": 0.23772719502449036, "learning_rate": 0.001, "loss": 2.3773, "step": 32000 }, { "epoch": 1.8775223723460255, "grad_norm": 0.29551535844802856, "learning_rate": 0.001, "loss": 2.3805, "step": 32100 }, { "epoch": 1.8833713516991284, "grad_norm": 0.18449169397354126, "learning_rate": 0.001, "loss": 2.3765, "step": 32200 }, { "epoch": 1.8892203310522313, "grad_norm": 0.23476749658584595, "learning_rate": 0.001, "loss": 2.3724, "step": 32300 }, { "epoch": 1.8950693104053342, "grad_norm": 0.2339320331811905, "learning_rate": 0.001, "loss": 2.3724, "step": 32400 }, { "epoch": 1.9009182897584371, "grad_norm": 0.29611435532569885, "learning_rate": 0.001, "loss": 2.3714, "step": 32500 }, { "epoch": 1.90676726911154, "grad_norm": 0.2156270146369934, "learning_rate": 0.001, "loss": 2.3639, "step": 32600 }, { "epoch": 1.912616248464643, "grad_norm": 0.25915658473968506, "learning_rate": 0.001, "loss": 2.3666, "step": 32700 }, { "epoch": 1.9184652278177459, "grad_norm": 0.2258334904909134, "learning_rate": 0.001, "loss": 2.3686, "step": 32800 }, { "epoch": 1.9243142071708488, "grad_norm": 0.22068630158901215, "learning_rate": 0.001, "loss": 2.366, "step": 32900 }, { "epoch": 1.9301631865239517, "grad_norm": 0.31538712978363037, "learning_rate": 0.001, "loss": 2.3656, "step": 33000 }, { "epoch": 1.9360121658770546, "grad_norm": 0.20591576397418976, "learning_rate": 0.001, "loss": 2.3636, "step": 33100 }, { "epoch": 1.9418611452301573, "grad_norm": 0.24140793085098267, "learning_rate": 0.001, "loss": 2.3586, "step": 33200 }, { "epoch": 1.9477101245832602, "grad_norm": 0.18351712822914124, "learning_rate": 0.001, "loss": 2.364, "step": 33300 }, { "epoch": 1.9535591039363631, "grad_norm": 0.2056625336408615, "learning_rate": 0.001, "loss": 2.3642, "step": 33400 }, { "epoch": 1.959408083289466, "grad_norm": 0.18169744312763214, "learning_rate": 0.001, "loss": 2.3652, "step": 33500 }, { "epoch": 1.9652570626425687, "grad_norm": 0.23775456845760345, "learning_rate": 0.001, "loss": 2.3585, "step": 33600 }, { "epoch": 1.9711060419956716, "grad_norm": 0.1885794699192047, "learning_rate": 0.001, "loss": 2.356, "step": 33700 }, { "epoch": 1.9769550213487745, "grad_norm": 0.2882946729660034, "learning_rate": 0.001, "loss": 2.3646, "step": 33800 }, { "epoch": 1.9828040007018775, "grad_norm": 0.1672821044921875, "learning_rate": 0.001, "loss": 2.3592, "step": 33900 }, { "epoch": 1.9886529800549804, "grad_norm": 0.3228713274002075, "learning_rate": 0.001, "loss": 2.365, "step": 34000 }, { "epoch": 1.9945019594080833, "grad_norm": 0.20571596920490265, "learning_rate": 0.001, "loss": 2.3561, "step": 34100 }, { "epoch": 2.000350938761186, "grad_norm": 0.23046864569187164, "learning_rate": 0.001, "loss": 2.3575, "step": 34200 }, { "epoch": 2.006199918114289, "grad_norm": 0.27128100395202637, "learning_rate": 0.001, "loss": 2.3312, "step": 34300 }, { "epoch": 2.012048897467392, "grad_norm": 0.27084216475486755, "learning_rate": 0.001, "loss": 2.336, "step": 34400 }, { "epoch": 2.017897876820495, "grad_norm": 0.24598562717437744, "learning_rate": 0.001, "loss": 2.3353, "step": 34500 }, { "epoch": 2.023746856173598, "grad_norm": 0.2801196873188019, "learning_rate": 0.001, "loss": 2.3317, "step": 34600 }, { "epoch": 2.0295958355267008, "grad_norm": 0.27919960021972656, "learning_rate": 0.001, "loss": 2.3398, "step": 34700 }, { "epoch": 2.0354448148798037, "grad_norm": 0.27122968435287476, "learning_rate": 0.001, "loss": 2.3302, "step": 34800 }, { "epoch": 2.0412937942329066, "grad_norm": 0.26992517709732056, "learning_rate": 0.001, "loss": 2.3317, "step": 34900 }, { "epoch": 2.047142773586009, "grad_norm": 0.2382284551858902, "learning_rate": 0.001, "loss": 2.323, "step": 35000 }, { "epoch": 2.052991752939112, "grad_norm": 0.330167680978775, "learning_rate": 0.001, "loss": 2.3272, "step": 35100 }, { "epoch": 2.058840732292215, "grad_norm": 0.28652581572532654, "learning_rate": 0.001, "loss": 2.3343, "step": 35200 }, { "epoch": 2.064689711645318, "grad_norm": 0.2247895747423172, "learning_rate": 0.001, "loss": 2.3238, "step": 35300 }, { "epoch": 2.0705386909984207, "grad_norm": 0.4077642560005188, "learning_rate": 0.001, "loss": 2.3336, "step": 35400 }, { "epoch": 2.0763876703515236, "grad_norm": 0.23775160312652588, "learning_rate": 0.001, "loss": 2.3253, "step": 35500 }, { "epoch": 2.0822366497046265, "grad_norm": 0.3252675533294678, "learning_rate": 0.001, "loss": 2.3302, "step": 35600 }, { "epoch": 2.0880856290577294, "grad_norm": 0.22632266581058502, "learning_rate": 0.001, "loss": 2.3306, "step": 35700 }, { "epoch": 2.0939346084108323, "grad_norm": 0.2730694115161896, "learning_rate": 0.001, "loss": 2.3305, "step": 35800 }, { "epoch": 2.0997835877639353, "grad_norm": 0.2510974109172821, "learning_rate": 0.001, "loss": 2.3331, "step": 35900 }, { "epoch": 2.105632567117038, "grad_norm": 0.4280712306499481, "learning_rate": 0.001, "loss": 2.3205, "step": 36000 }, { "epoch": 2.111481546470141, "grad_norm": 0.26897507905960083, "learning_rate": 0.001, "loss": 2.3254, "step": 36100 }, { "epoch": 2.117330525823244, "grad_norm": 0.26800480484962463, "learning_rate": 0.001, "loss": 2.3249, "step": 36200 }, { "epoch": 2.123179505176347, "grad_norm": 0.4287492632865906, "learning_rate": 0.001, "loss": 2.3245, "step": 36300 }, { "epoch": 2.12902848452945, "grad_norm": 0.38937726616859436, "learning_rate": 0.001, "loss": 2.327, "step": 36400 }, { "epoch": 2.1348774638825523, "grad_norm": 0.4529838263988495, "learning_rate": 0.001, "loss": 2.3307, "step": 36500 }, { "epoch": 2.140726443235655, "grad_norm": 0.32198020815849304, "learning_rate": 0.001, "loss": 2.3307, "step": 36600 }, { "epoch": 2.146575422588758, "grad_norm": 0.3261450231075287, "learning_rate": 0.001, "loss": 2.3261, "step": 36700 }, { "epoch": 2.152424401941861, "grad_norm": 0.28807538747787476, "learning_rate": 0.001, "loss": 2.3264, "step": 36800 }, { "epoch": 2.158273381294964, "grad_norm": 0.23441243171691895, "learning_rate": 0.001, "loss": 2.3226, "step": 36900 }, { "epoch": 2.164122360648067, "grad_norm": 0.3896152675151825, "learning_rate": 0.001, "loss": 2.3216, "step": 37000 }, { "epoch": 2.1699713400011698, "grad_norm": 0.2384829968214035, "learning_rate": 0.001, "loss": 2.3251, "step": 37100 }, { "epoch": 2.1758203193542727, "grad_norm": 0.33747655153274536, "learning_rate": 0.001, "loss": 2.3188, "step": 37200 }, { "epoch": 2.1816692987073756, "grad_norm": 0.48914507031440735, "learning_rate": 0.001, "loss": 2.324, "step": 37300 }, { "epoch": 2.1875182780604785, "grad_norm": 0.25418758392333984, "learning_rate": 0.001, "loss": 2.3189, "step": 37400 }, { "epoch": 2.1933672574135814, "grad_norm": 0.264364629983902, "learning_rate": 0.001, "loss": 2.3227, "step": 37500 }, { "epoch": 2.1992162367666843, "grad_norm": 0.215361088514328, "learning_rate": 0.001, "loss": 2.3195, "step": 37600 }, { "epoch": 2.2050652161197872, "grad_norm": 0.27733340859413147, "learning_rate": 0.001, "loss": 2.3214, "step": 37700 }, { "epoch": 2.21091419547289, "grad_norm": 0.26398155093193054, "learning_rate": 0.001, "loss": 2.3172, "step": 37800 }, { "epoch": 2.216763174825993, "grad_norm": 0.24838100373744965, "learning_rate": 0.001, "loss": 2.3164, "step": 37900 }, { "epoch": 2.222612154179096, "grad_norm": 0.22284917533397675, "learning_rate": 0.001, "loss": 2.3213, "step": 38000 }, { "epoch": 2.2284611335321984, "grad_norm": 0.3620869815349579, "learning_rate": 0.001, "loss": 2.3131, "step": 38100 }, { "epoch": 2.2343101128853013, "grad_norm": 0.22537147998809814, "learning_rate": 0.001, "loss": 2.3136, "step": 38200 }, { "epoch": 2.2401590922384043, "grad_norm": 0.33595022559165955, "learning_rate": 0.001, "loss": 2.3133, "step": 38300 }, { "epoch": 2.246008071591507, "grad_norm": 0.22938361763954163, "learning_rate": 0.001, "loss": 2.3144, "step": 38400 }, { "epoch": 2.25185705094461, "grad_norm": 0.2649751901626587, "learning_rate": 0.001, "loss": 2.3178, "step": 38500 }, { "epoch": 2.257706030297713, "grad_norm": 0.2797791659832001, "learning_rate": 0.001, "loss": 2.313, "step": 38600 }, { "epoch": 2.263555009650816, "grad_norm": 0.2607049345970154, "learning_rate": 0.001, "loss": 2.3111, "step": 38700 }, { "epoch": 2.269403989003919, "grad_norm": 0.2774815261363983, "learning_rate": 0.001, "loss": 2.305, "step": 38800 }, { "epoch": 2.2752529683570217, "grad_norm": 0.24359388649463654, "learning_rate": 0.001, "loss": 2.3116, "step": 38900 }, { "epoch": 2.2811019477101246, "grad_norm": 0.23390889167785645, "learning_rate": 0.001, "loss": 2.3159, "step": 39000 }, { "epoch": 2.2869509270632276, "grad_norm": 0.26259610056877136, "learning_rate": 0.001, "loss": 2.3081, "step": 39100 }, { "epoch": 2.2927999064163305, "grad_norm": 0.23905432224273682, "learning_rate": 0.001, "loss": 2.3082, "step": 39200 }, { "epoch": 2.2986488857694334, "grad_norm": 0.20884330570697784, "learning_rate": 0.001, "loss": 2.3079, "step": 39300 }, { "epoch": 2.3044978651225363, "grad_norm": 0.24212944507598877, "learning_rate": 0.001, "loss": 2.3107, "step": 39400 }, { "epoch": 2.3103468444756388, "grad_norm": 0.24700592458248138, "learning_rate": 0.001, "loss": 2.3033, "step": 39500 }, { "epoch": 2.3161958238287417, "grad_norm": 0.24097232520580292, "learning_rate": 0.001, "loss": 2.3068, "step": 39600 }, { "epoch": 2.3220448031818446, "grad_norm": 0.2660614252090454, "learning_rate": 0.001, "loss": 2.3035, "step": 39700 }, { "epoch": 2.3278937825349475, "grad_norm": 0.26029708981513977, "learning_rate": 0.001, "loss": 2.3113, "step": 39800 }, { "epoch": 2.3337427618880504, "grad_norm": 0.28577178716659546, "learning_rate": 0.001, "loss": 2.312, "step": 39900 }, { "epoch": 2.3395917412411533, "grad_norm": 0.27604636549949646, "learning_rate": 0.001, "loss": 2.3114, "step": 40000 }, { "epoch": 2.3454407205942562, "grad_norm": 0.28683075308799744, "learning_rate": 0.001, "loss": 2.3005, "step": 40100 }, { "epoch": 2.351289699947359, "grad_norm": 0.2506248950958252, "learning_rate": 0.001, "loss": 2.3019, "step": 40200 }, { "epoch": 2.357138679300462, "grad_norm": 0.2951677143573761, "learning_rate": 0.001, "loss": 2.3021, "step": 40300 }, { "epoch": 2.362987658653565, "grad_norm": 0.25638511776924133, "learning_rate": 0.001, "loss": 2.3055, "step": 40400 }, { "epoch": 2.368836638006668, "grad_norm": 0.19738303124904633, "learning_rate": 0.001, "loss": 2.298, "step": 40500 }, { "epoch": 2.374685617359771, "grad_norm": 0.31506773829460144, "learning_rate": 0.001, "loss": 2.2974, "step": 40600 }, { "epoch": 2.3805345967128737, "grad_norm": 0.2250044345855713, "learning_rate": 0.001, "loss": 2.294, "step": 40700 }, { "epoch": 2.3863835760659766, "grad_norm": 0.23361743986606598, "learning_rate": 0.001, "loss": 2.3032, "step": 40800 }, { "epoch": 2.3922325554190795, "grad_norm": 0.21691115200519562, "learning_rate": 0.001, "loss": 2.3107, "step": 40900 }, { "epoch": 2.3980815347721824, "grad_norm": 0.23371458053588867, "learning_rate": 0.001, "loss": 2.301, "step": 41000 }, { "epoch": 2.4039305141252854, "grad_norm": 0.24374869465827942, "learning_rate": 0.001, "loss": 2.2996, "step": 41100 }, { "epoch": 2.4097794934783883, "grad_norm": 0.19761334359645844, "learning_rate": 0.001, "loss": 2.302, "step": 41200 }, { "epoch": 2.4156284728314907, "grad_norm": 0.24785877764225006, "learning_rate": 0.001, "loss": 2.3058, "step": 41300 }, { "epoch": 2.4214774521845936, "grad_norm": 0.21458207070827484, "learning_rate": 0.001, "loss": 2.3009, "step": 41400 }, { "epoch": 2.4273264315376966, "grad_norm": 0.23316971957683563, "learning_rate": 0.001, "loss": 2.3003, "step": 41500 }, { "epoch": 2.4331754108907995, "grad_norm": 0.308971107006073, "learning_rate": 0.001, "loss": 2.3033, "step": 41600 }, { "epoch": 2.4390243902439024, "grad_norm": 0.30708208680152893, "learning_rate": 0.001, "loss": 2.2971, "step": 41700 }, { "epoch": 2.4448733695970053, "grad_norm": 0.26860737800598145, "learning_rate": 0.001, "loss": 2.2979, "step": 41800 }, { "epoch": 2.450722348950108, "grad_norm": 0.40625572204589844, "learning_rate": 0.001, "loss": 2.2987, "step": 41900 }, { "epoch": 2.456571328303211, "grad_norm": 0.4041377007961273, "learning_rate": 0.001, "loss": 2.2936, "step": 42000 }, { "epoch": 2.462420307656314, "grad_norm": 0.20331577956676483, "learning_rate": 0.001, "loss": 2.2997, "step": 42100 }, { "epoch": 2.468269287009417, "grad_norm": 0.26115936040878296, "learning_rate": 0.001, "loss": 2.3008, "step": 42200 }, { "epoch": 2.47411826636252, "grad_norm": 0.19157308340072632, "learning_rate": 0.001, "loss": 2.2916, "step": 42300 }, { "epoch": 2.4799672457156228, "grad_norm": 0.27080652117729187, "learning_rate": 0.001, "loss": 2.2932, "step": 42400 }, { "epoch": 2.4858162250687257, "grad_norm": 0.327761709690094, "learning_rate": 0.001, "loss": 2.2932, "step": 42500 }, { "epoch": 2.491665204421828, "grad_norm": 0.2846272587776184, "learning_rate": 0.001, "loss": 2.2942, "step": 42600 }, { "epoch": 2.497514183774931, "grad_norm": 0.26406973600387573, "learning_rate": 0.001, "loss": 2.2963, "step": 42700 }, { "epoch": 2.503363163128034, "grad_norm": 0.26711305975914, "learning_rate": 0.001, "loss": 2.2916, "step": 42800 }, { "epoch": 2.509212142481137, "grad_norm": 0.3495466709136963, "learning_rate": 0.001, "loss": 2.2919, "step": 42900 }, { "epoch": 2.51506112183424, "grad_norm": 0.30732566118240356, "learning_rate": 0.001, "loss": 2.2888, "step": 43000 }, { "epoch": 2.5209101011873427, "grad_norm": 0.26624858379364014, "learning_rate": 0.001, "loss": 2.2883, "step": 43100 }, { "epoch": 2.5267590805404456, "grad_norm": 0.2762547433376312, "learning_rate": 0.001, "loss": 2.2862, "step": 43200 }, { "epoch": 2.5326080598935485, "grad_norm": 0.3161594271659851, "learning_rate": 0.001, "loss": 2.2961, "step": 43300 }, { "epoch": 2.5384570392466514, "grad_norm": 0.21547982096672058, "learning_rate": 0.001, "loss": 2.2929, "step": 43400 }, { "epoch": 2.5443060185997544, "grad_norm": 0.24329930543899536, "learning_rate": 0.001, "loss": 2.2852, "step": 43500 }, { "epoch": 2.5501549979528573, "grad_norm": 0.19611454010009766, "learning_rate": 0.001, "loss": 2.2943, "step": 43600 }, { "epoch": 2.55600397730596, "grad_norm": 0.21112008392810822, "learning_rate": 0.001, "loss": 2.2922, "step": 43700 }, { "epoch": 2.561852956659063, "grad_norm": 0.2987576425075531, "learning_rate": 0.001, "loss": 2.2837, "step": 43800 }, { "epoch": 2.567701936012166, "grad_norm": 0.265878289937973, "learning_rate": 0.001, "loss": 2.2838, "step": 43900 }, { "epoch": 2.573550915365269, "grad_norm": 0.3082035779953003, "learning_rate": 0.001, "loss": 2.2849, "step": 44000 }, { "epoch": 2.579399894718372, "grad_norm": 0.25639668107032776, "learning_rate": 0.001, "loss": 2.2841, "step": 44100 }, { "epoch": 2.5852488740714747, "grad_norm": 0.25058746337890625, "learning_rate": 0.001, "loss": 2.285, "step": 44200 }, { "epoch": 2.5910978534245777, "grad_norm": 0.2944709062576294, "learning_rate": 0.001, "loss": 2.2906, "step": 44300 }, { "epoch": 2.5969468327776806, "grad_norm": 0.2808932662010193, "learning_rate": 0.001, "loss": 2.2877, "step": 44400 }, { "epoch": 2.602795812130783, "grad_norm": 0.20849645137786865, "learning_rate": 0.001, "loss": 2.2893, "step": 44500 }, { "epoch": 2.608644791483886, "grad_norm": 0.23109427094459534, "learning_rate": 0.001, "loss": 2.2831, "step": 44600 }, { "epoch": 2.614493770836989, "grad_norm": 0.24879351258277893, "learning_rate": 0.001, "loss": 2.2838, "step": 44700 }, { "epoch": 2.6203427501900918, "grad_norm": 0.25297310948371887, "learning_rate": 0.001, "loss": 2.2783, "step": 44800 }, { "epoch": 2.6261917295431947, "grad_norm": 0.27281785011291504, "learning_rate": 0.001, "loss": 2.2824, "step": 44900 }, { "epoch": 2.6320407088962976, "grad_norm": 0.32887884974479675, "learning_rate": 0.001, "loss": 2.2816, "step": 45000 }, { "epoch": 2.6378896882494005, "grad_norm": 0.30834099650382996, "learning_rate": 0.001, "loss": 2.2779, "step": 45100 }, { "epoch": 2.6437386676025034, "grad_norm": 0.22234949469566345, "learning_rate": 0.001, "loss": 2.2757, "step": 45200 }, { "epoch": 2.6495876469556063, "grad_norm": 0.2571384906768799, "learning_rate": 0.001, "loss": 2.2819, "step": 45300 }, { "epoch": 2.6554366263087092, "grad_norm": 0.25584903359413147, "learning_rate": 0.001, "loss": 2.2786, "step": 45400 }, { "epoch": 2.661285605661812, "grad_norm": 0.26488906145095825, "learning_rate": 0.001, "loss": 2.2775, "step": 45500 }, { "epoch": 2.6671345850149146, "grad_norm": 0.20026127994060516, "learning_rate": 0.001, "loss": 2.2801, "step": 45600 }, { "epoch": 2.6729835643680175, "grad_norm": 0.30555716156959534, "learning_rate": 0.001, "loss": 2.2789, "step": 45700 }, { "epoch": 2.6788325437211205, "grad_norm": 0.29843583703041077, "learning_rate": 0.001, "loss": 2.2728, "step": 45800 }, { "epoch": 2.6846815230742234, "grad_norm": 0.2750077247619629, "learning_rate": 0.001, "loss": 2.2729, "step": 45900 }, { "epoch": 2.6905305024273263, "grad_norm": 0.34935322403907776, "learning_rate": 0.001, "loss": 2.2818, "step": 46000 }, { "epoch": 2.696379481780429, "grad_norm": 0.29132622480392456, "learning_rate": 0.001, "loss": 2.2772, "step": 46100 }, { "epoch": 2.702228461133532, "grad_norm": 0.21108755469322205, "learning_rate": 0.001, "loss": 2.2654, "step": 46200 }, { "epoch": 2.708077440486635, "grad_norm": 0.19289465248584747, "learning_rate": 0.001, "loss": 2.2733, "step": 46300 }, { "epoch": 2.713926419839738, "grad_norm": 0.3623882234096527, "learning_rate": 0.001, "loss": 2.2746, "step": 46400 }, { "epoch": 2.719775399192841, "grad_norm": 0.2865779399871826, "learning_rate": 0.001, "loss": 2.2781, "step": 46500 }, { "epoch": 2.7256243785459437, "grad_norm": 0.25996798276901245, "learning_rate": 0.001, "loss": 2.2796, "step": 46600 }, { "epoch": 2.7314733578990467, "grad_norm": 0.2786872982978821, "learning_rate": 0.001, "loss": 2.273, "step": 46700 }, { "epoch": 2.7373223372521496, "grad_norm": 0.3529917597770691, "learning_rate": 0.001, "loss": 2.279, "step": 46800 }, { "epoch": 2.7431713166052525, "grad_norm": 0.3310040533542633, "learning_rate": 0.001, "loss": 2.2748, "step": 46900 }, { "epoch": 2.7490202959583554, "grad_norm": 0.29888084530830383, "learning_rate": 0.001, "loss": 2.2687, "step": 47000 }, { "epoch": 2.7548692753114583, "grad_norm": 0.32164329290390015, "learning_rate": 0.001, "loss": 2.277, "step": 47100 }, { "epoch": 2.760718254664561, "grad_norm": 0.32596132159233093, "learning_rate": 0.001, "loss": 2.2683, "step": 47200 }, { "epoch": 2.766567234017664, "grad_norm": 0.3350686728954315, "learning_rate": 0.001, "loss": 2.2714, "step": 47300 }, { "epoch": 2.772416213370767, "grad_norm": 0.27003711462020874, "learning_rate": 0.001, "loss": 2.2689, "step": 47400 }, { "epoch": 2.77826519272387, "grad_norm": 0.2382701337337494, "learning_rate": 0.001, "loss": 2.268, "step": 47500 }, { "epoch": 2.7841141720769724, "grad_norm": 0.23942124843597412, "learning_rate": 0.001, "loss": 2.2701, "step": 47600 }, { "epoch": 2.7899631514300753, "grad_norm": 0.2385798841714859, "learning_rate": 0.001, "loss": 2.2681, "step": 47700 }, { "epoch": 2.7958121307831783, "grad_norm": 0.2828216552734375, "learning_rate": 0.001, "loss": 2.2718, "step": 47800 }, { "epoch": 2.801661110136281, "grad_norm": 0.305735319852829, "learning_rate": 0.001, "loss": 2.2644, "step": 47900 }, { "epoch": 2.807510089489384, "grad_norm": 0.312160462141037, "learning_rate": 0.001, "loss": 2.2642, "step": 48000 }, { "epoch": 2.813359068842487, "grad_norm": 0.26398301124572754, "learning_rate": 0.001, "loss": 2.2604, "step": 48100 }, { "epoch": 2.81920804819559, "grad_norm": 0.2555355429649353, "learning_rate": 0.001, "loss": 2.2657, "step": 48200 }, { "epoch": 2.825057027548693, "grad_norm": 0.25002944469451904, "learning_rate": 0.001, "loss": 2.2649, "step": 48300 }, { "epoch": 2.8309060069017957, "grad_norm": 0.3330962061882019, "learning_rate": 0.001, "loss": 2.2621, "step": 48400 }, { "epoch": 2.8367549862548986, "grad_norm": 0.24761629104614258, "learning_rate": 0.001, "loss": 2.2652, "step": 48500 }, { "epoch": 2.8426039656080015, "grad_norm": 0.1836179792881012, "learning_rate": 0.001, "loss": 2.2635, "step": 48600 }, { "epoch": 2.848452944961104, "grad_norm": 0.2775278687477112, "learning_rate": 0.001, "loss": 2.2637, "step": 48700 }, { "epoch": 2.854301924314207, "grad_norm": 0.29012468457221985, "learning_rate": 0.001, "loss": 2.2606, "step": 48800 }, { "epoch": 2.86015090366731, "grad_norm": 0.25846928358078003, "learning_rate": 0.001, "loss": 2.2577, "step": 48900 }, { "epoch": 2.8659998830204128, "grad_norm": 0.3181585371494293, "learning_rate": 0.001, "loss": 2.264, "step": 49000 }, { "epoch": 2.8718488623735157, "grad_norm": 0.3909049332141876, "learning_rate": 0.001, "loss": 2.2651, "step": 49100 }, { "epoch": 2.8776978417266186, "grad_norm": 0.2097274512052536, "learning_rate": 0.001, "loss": 2.261, "step": 49200 }, { "epoch": 2.8835468210797215, "grad_norm": 0.2584356665611267, "learning_rate": 0.001, "loss": 2.2646, "step": 49300 }, { "epoch": 2.8893958004328244, "grad_norm": 0.21574875712394714, "learning_rate": 0.001, "loss": 2.2626, "step": 49400 }, { "epoch": 2.8952447797859273, "grad_norm": 0.3638247549533844, "learning_rate": 0.001, "loss": 2.2608, "step": 49500 }, { "epoch": 2.9010937591390302, "grad_norm": 0.28827133774757385, "learning_rate": 0.001, "loss": 2.265, "step": 49600 }, { "epoch": 2.906942738492133, "grad_norm": 0.23058092594146729, "learning_rate": 0.001, "loss": 2.2633, "step": 49700 }, { "epoch": 2.912791717845236, "grad_norm": 0.308886855840683, "learning_rate": 0.001, "loss": 2.2555, "step": 49800 }, { "epoch": 2.918640697198339, "grad_norm": 0.19722002744674683, "learning_rate": 0.001, "loss": 2.254, "step": 49900 }, { "epoch": 2.924489676551442, "grad_norm": 0.2994515001773834, "learning_rate": 0.001, "loss": 2.2587, "step": 50000 }, { "epoch": 2.924489676551442, "eval_ag_news_accuracy": 0.223640625, "eval_ag_news_bleu_score": 4.450678267925785, "eval_ag_news_bleu_score_sem": 0.4263978675810994, "eval_ag_news_emb_cos_sim": 0.593666136264801, "eval_ag_news_emb_cos_sim_sem": 0.01688028685748577, "eval_ag_news_emb_top1_equal": 0.921875, "eval_ag_news_emb_top1_equal_sem": 0.023813825100660324, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.96714448928833, "eval_ag_news_n_ngrams_match_1": 9.9375, "eval_ag_news_n_ngrams_match_2": 2.1875, "eval_ag_news_n_ngrams_match_3": 0.7265625, "eval_ag_news_num_pred_words": 44.6328125, "eval_ag_news_num_true_words": 42.6953125, "eval_ag_news_perplexity": 19.436339603232366, "eval_ag_news_pred_num_tokens": 75.0859375, "eval_ag_news_rouge_score": 0.21969896644109488, "eval_ag_news_runtime": 35.2221, "eval_ag_news_samples_per_second": 14.196, "eval_ag_news_steps_per_second": 0.028, "eval_ag_news_token_set_f1": 0.26153643575653973, "eval_ag_news_token_set_f1_sem": 0.010125289452225001, "eval_ag_news_token_set_precision": 0.23445118505174156, "eval_ag_news_token_set_recall": 0.3118360447607316, "eval_ag_news_true_num_tokens": 59.3515625, "step": 50000 }, { "epoch": 2.924489676551442, "eval_anthropic_toxic_prompts_accuracy": 0.093171875, "eval_anthropic_toxic_prompts_bleu_score": 31.649351424810497, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.515192727636972, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8296329975128174, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.01145751029253006, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.0859375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02487009494926453, "eval_anthropic_toxic_prompts_loss": 1.5353831052780151, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 7.7421875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 4.1875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 2.4375, "eval_anthropic_toxic_prompts_num_pred_words": 14.65625, "eval_anthropic_toxic_prompts_num_true_words": 14.7421875, "eval_anthropic_toxic_prompts_perplexity": 4.643103986907843, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.578125, "eval_anthropic_toxic_prompts_rouge_score": 0.5759574604675454, "eval_anthropic_toxic_prompts_runtime": 29.1597, "eval_anthropic_toxic_prompts_samples_per_second": 17.147, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.5963475781211653, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.020292673312947453, "eval_anthropic_toxic_prompts_token_set_precision": 0.5870016755660531, "eval_anthropic_toxic_prompts_token_set_recall": 0.6143137529750469, "eval_anthropic_toxic_prompts_true_num_tokens": 17.90625, "step": 50000 }, { "epoch": 2.924489676551442, "eval_arxiv_accuracy": 0.354453125, "eval_arxiv_bleu_score": 0.9162884644079639, "eval_arxiv_bleu_score_sem": 0.09709588969452362, "eval_arxiv_emb_cos_sim": 0.294391006231308, "eval_arxiv_emb_cos_sim_sem": 0.014647489413619041, "eval_arxiv_emb_top1_equal": 0.8359375, "eval_arxiv_emb_top1_equal_sem": 0.032861676067113876, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.602428436279297, "eval_arxiv_n_ngrams_match_1": 8.1875, "eval_arxiv_n_ngrams_match_2": 1.2734375, "eval_arxiv_n_ngrams_match_3": 0.1484375, "eval_arxiv_num_pred_words": 44.953125, "eval_arxiv_num_true_words": 86.84375, "eval_arxiv_perplexity": 36.68721892680209, "eval_arxiv_pred_num_tokens": 126.4921875, "eval_arxiv_rouge_score": 0.11370043055153989, "eval_arxiv_runtime": 29.4433, "eval_arxiv_samples_per_second": 16.982, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.1100024658524325, "eval_arxiv_token_set_f1_sem": 0.00799750204335746, "eval_arxiv_token_set_precision": 0.06953367486409505, "eval_arxiv_token_set_recall": 0.34810354980362024, "eval_arxiv_true_num_tokens": 126.5703125, "step": 50000 }, { "epoch": 2.924489676551442, "eval_python_code_alpaca_accuracy": 0.116921875, "eval_python_code_alpaca_bleu_score": 17.35659783690799, "eval_python_code_alpaca_bleu_score_sem": 1.2123408931786623, "eval_python_code_alpaca_emb_cos_sim": 0.7737069129943848, "eval_python_code_alpaca_emb_cos_sim_sem": 0.01349919568747282, "eval_python_code_alpaca_emb_top1_equal": 0.96875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.01543935015797615, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.8545336723327637, "eval_python_code_alpaca_n_ngrams_match_1": 8.1640625, "eval_python_code_alpaca_n_ngrams_match_2": 3.5625, "eval_python_code_alpaca_n_ngrams_match_3": 1.71875, "eval_python_code_alpaca_num_pred_words": 17.125, "eval_python_code_alpaca_num_true_words": 18.328125, "eval_python_code_alpaca_perplexity": 6.388718319752116, "eval_python_code_alpaca_pred_num_tokens": 25.9609375, "eval_python_code_alpaca_rouge_score": 0.4736712459255032, "eval_python_code_alpaca_runtime": 28.1192, "eval_python_code_alpaca_samples_per_second": 17.781, "eval_python_code_alpaca_steps_per_second": 0.036, "eval_python_code_alpaca_token_set_f1": 0.49656739710303843, "eval_python_code_alpaca_token_set_f1_sem": 0.013515733327550846, "eval_python_code_alpaca_token_set_precision": 0.4757238028343423, "eval_python_code_alpaca_token_set_recall": 0.5279870292393534, "eval_python_code_alpaca_true_num_tokens": 23.6015625, "step": 50000 }, { "epoch": 2.924489676551442, "eval_wikibio_accuracy": 0.337125, "eval_wikibio_bleu_score": 4.071469534579809, "eval_wikibio_bleu_score_sem": 0.6047080861500594, "eval_wikibio_emb_cos_sim": 0.3918916583061218, "eval_wikibio_emb_cos_sim_sem": 0.024516111239790916, "eval_wikibio_emb_top1_equal": 0.7109375, "eval_wikibio_emb_top1_equal_sem": 0.04022626578807831, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.8971633911132812, "eval_wikibio_n_ngrams_match_1": 9.921875, "eval_wikibio_n_ngrams_match_2": 2.765625, "eval_wikibio_n_ngrams_match_3": 1.078125, "eval_wikibio_num_pred_words": 47.890625, "eval_wikibio_num_true_words": 52.2734375, "eval_wikibio_perplexity": 18.12266547586016, "eval_wikibio_pred_num_tokens": 115.390625, "eval_wikibio_rouge_score": 0.1748453174251259, "eval_wikibio_runtime": 28.8035, "eval_wikibio_samples_per_second": 17.359, "eval_wikibio_steps_per_second": 0.035, "eval_wikibio_token_set_f1": 0.19784838418662282, "eval_wikibio_token_set_f1_sem": 0.015030918256492873, "eval_wikibio_token_set_precision": 0.1669377467285533, "eval_wikibio_token_set_recall": 0.2772062378495816, "eval_wikibio_true_num_tokens": 102.1171875, "step": 50000 }, { "epoch": 2.924489676551442, "eval_msmarco_accuracy": 0.3513125, "eval_msmarco_bleu_score": 8.911213696522633, "eval_msmarco_bleu_score_sem": 0.8752000565095142, "eval_msmarco_emb_cos_sim": 0.6517541408538818, "eval_msmarco_emb_cos_sim_sem": 0.020483573898673058, "eval_msmarco_emb_top1_equal": 0.90625, "eval_msmarco_emb_top1_equal_sem": 0.025864720344543457, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 2.098381280899048, "eval_msmarco_n_ngrams_match_1": 21.8984375, "eval_msmarco_n_ngrams_match_2": 7.4765625, "eval_msmarco_n_ngrams_match_3": 3.2109375, "eval_msmarco_num_pred_words": 62.2109375, "eval_msmarco_num_true_words": 63.625, "eval_msmarco_perplexity": 8.15296187028775, "eval_msmarco_pred_num_tokens": 90.3359375, "eval_msmarco_rouge_score": 0.3309185933350569, "eval_msmarco_runtime": 24.9527, "eval_msmarco_samples_per_second": 20.038, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.3685048061750078, "eval_msmarco_token_set_f1_sem": 0.014108064201000095, "eval_msmarco_token_set_precision": 0.31790732354119927, "eval_msmarco_token_set_recall": 0.48306491409228175, "eval_msmarco_true_num_tokens": 82.25, "step": 50000 }, { "epoch": 2.930338655904545, "grad_norm": 0.2535139322280884, "learning_rate": 0.001, "loss": 2.2583, "step": 50100 }, { "epoch": 2.9361876352576477, "grad_norm": 0.2370670586824417, "learning_rate": 0.001, "loss": 2.2586, "step": 50200 }, { "epoch": 2.9420366146107506, "grad_norm": 0.19793137907981873, "learning_rate": 0.001, "loss": 2.2581, "step": 50300 }, { "epoch": 2.9478855939638535, "grad_norm": 0.2516927123069763, "learning_rate": 0.001, "loss": 2.2595, "step": 50400 }, { "epoch": 2.9537345733169564, "grad_norm": 0.2974695861339569, "learning_rate": 0.001, "loss": 2.2509, "step": 50500 }, { "epoch": 2.9595835526700593, "grad_norm": 0.2938063144683838, "learning_rate": 0.001, "loss": 2.2623, "step": 50600 }, { "epoch": 2.965432532023162, "grad_norm": 0.2289794534444809, "learning_rate": 0.001, "loss": 2.249, "step": 50700 }, { "epoch": 2.9712815113762647, "grad_norm": 0.33437925577163696, "learning_rate": 0.001, "loss": 2.2508, "step": 50800 }, { "epoch": 2.9771304907293676, "grad_norm": 0.304436594247818, "learning_rate": 0.001, "loss": 2.2515, "step": 50900 }, { "epoch": 2.9829794700824706, "grad_norm": 0.2766879200935364, "learning_rate": 0.001, "loss": 2.2544, "step": 51000 }, { "epoch": 2.9888284494355735, "grad_norm": 0.254157155752182, "learning_rate": 0.001, "loss": 2.2598, "step": 51100 }, { "epoch": 2.9946774287886764, "grad_norm": 0.21902090311050415, "learning_rate": 0.001, "loss": 2.2574, "step": 51200 }, { "epoch": 3.0005264081417793, "grad_norm": 0.35678958892822266, "learning_rate": 0.001, "loss": 2.2536, "step": 51300 }, { "epoch": 3.006375387494882, "grad_norm": 0.20347364246845245, "learning_rate": 0.001, "loss": 2.2382, "step": 51400 }, { "epoch": 3.012224366847985, "grad_norm": 0.20172689855098724, "learning_rate": 0.001, "loss": 2.2385, "step": 51500 }, { "epoch": 3.018073346201088, "grad_norm": 0.2707879841327667, "learning_rate": 0.001, "loss": 2.2385, "step": 51600 }, { "epoch": 3.023922325554191, "grad_norm": 0.22560401260852814, "learning_rate": 0.001, "loss": 2.2365, "step": 51700 }, { "epoch": 3.029771304907294, "grad_norm": 0.2782430052757263, "learning_rate": 0.001, "loss": 2.2425, "step": 51800 }, { "epoch": 3.0356202842603968, "grad_norm": 0.26410114765167236, "learning_rate": 0.001, "loss": 2.24, "step": 51900 }, { "epoch": 3.0414692636134992, "grad_norm": 0.3297409415245056, "learning_rate": 0.001, "loss": 2.2336, "step": 52000 }, { "epoch": 3.047318242966602, "grad_norm": 0.22123177349567413, "learning_rate": 0.001, "loss": 2.2382, "step": 52100 }, { "epoch": 3.053167222319705, "grad_norm": 0.3337898552417755, "learning_rate": 0.001, "loss": 2.2316, "step": 52200 }, { "epoch": 3.059016201672808, "grad_norm": 0.31209108233451843, "learning_rate": 0.001, "loss": 2.2394, "step": 52300 }, { "epoch": 3.064865181025911, "grad_norm": 0.2548010051250458, "learning_rate": 0.001, "loss": 2.2342, "step": 52400 }, { "epoch": 3.070714160379014, "grad_norm": 0.3284762501716614, "learning_rate": 0.001, "loss": 2.2347, "step": 52500 }, { "epoch": 3.0765631397321167, "grad_norm": 0.27913179993629456, "learning_rate": 0.001, "loss": 2.2363, "step": 52600 }, { "epoch": 3.0824121190852196, "grad_norm": 0.3541560769081116, "learning_rate": 0.001, "loss": 2.2343, "step": 52700 }, { "epoch": 3.0882610984383225, "grad_norm": 0.25529077649116516, "learning_rate": 0.001, "loss": 2.2368, "step": 52800 }, { "epoch": 3.0941100777914254, "grad_norm": 0.2586197555065155, "learning_rate": 0.001, "loss": 2.2373, "step": 52900 }, { "epoch": 3.0999590571445284, "grad_norm": 0.2504059076309204, "learning_rate": 0.001, "loss": 2.2365, "step": 53000 }, { "epoch": 3.1058080364976313, "grad_norm": 0.22332555055618286, "learning_rate": 0.001, "loss": 2.2395, "step": 53100 }, { "epoch": 3.111657015850734, "grad_norm": 0.33512699604034424, "learning_rate": 0.001, "loss": 2.2269, "step": 53200 }, { "epoch": 3.117505995203837, "grad_norm": 0.28996923565864563, "learning_rate": 0.001, "loss": 2.2297, "step": 53300 }, { "epoch": 3.12335497455694, "grad_norm": 0.34084635972976685, "learning_rate": 0.001, "loss": 2.2334, "step": 53400 }, { "epoch": 3.129203953910043, "grad_norm": 0.3676254153251648, "learning_rate": 0.001, "loss": 2.2339, "step": 53500 }, { "epoch": 3.1350529332631454, "grad_norm": 0.4085874557495117, "learning_rate": 0.001, "loss": 2.2373, "step": 53600 }, { "epoch": 3.1409019126162483, "grad_norm": 0.300515353679657, "learning_rate": 0.001, "loss": 2.2294, "step": 53700 }, { "epoch": 3.146750891969351, "grad_norm": 0.18238405883312225, "learning_rate": 0.001, "loss": 2.2445, "step": 53800 }, { "epoch": 3.152599871322454, "grad_norm": 0.42770305275917053, "learning_rate": 0.001, "loss": 2.2318, "step": 53900 }, { "epoch": 3.158448850675557, "grad_norm": 0.3630557358264923, "learning_rate": 0.001, "loss": 2.2357, "step": 54000 }, { "epoch": 3.16429783002866, "grad_norm": 0.32010555267333984, "learning_rate": 0.001, "loss": 2.2308, "step": 54100 }, { "epoch": 3.170146809381763, "grad_norm": 0.3513212203979492, "learning_rate": 0.001, "loss": 2.2368, "step": 54200 }, { "epoch": 3.1759957887348658, "grad_norm": 0.4715190827846527, "learning_rate": 0.001, "loss": 2.2327, "step": 54300 }, { "epoch": 3.1818447680879687, "grad_norm": 0.2119813859462738, "learning_rate": 0.001, "loss": 2.2311, "step": 54400 }, { "epoch": 3.1876937474410716, "grad_norm": 0.3242635428905487, "learning_rate": 0.001, "loss": 2.2292, "step": 54500 }, { "epoch": 3.1935427267941745, "grad_norm": 0.3505260944366455, "learning_rate": 0.001, "loss": 2.2328, "step": 54600 }, { "epoch": 3.1993917061472774, "grad_norm": 0.3376452922821045, "learning_rate": 0.001, "loss": 2.2326, "step": 54700 }, { "epoch": 3.2052406855003803, "grad_norm": 0.28784188628196716, "learning_rate": 0.001, "loss": 2.2347, "step": 54800 }, { "epoch": 3.2110896648534832, "grad_norm": 0.36276480555534363, "learning_rate": 0.001, "loss": 2.2258, "step": 54900 }, { "epoch": 3.216938644206586, "grad_norm": 0.21524125337600708, "learning_rate": 0.001, "loss": 2.2309, "step": 55000 }, { "epoch": 3.2227876235596886, "grad_norm": 0.34234869480133057, "learning_rate": 0.001, "loss": 2.2312, "step": 55100 }, { "epoch": 3.2286366029127915, "grad_norm": 0.31745463609695435, "learning_rate": 0.001, "loss": 2.2271, "step": 55200 }, { "epoch": 3.2344855822658944, "grad_norm": 0.25209522247314453, "learning_rate": 0.001, "loss": 2.2274, "step": 55300 }, { "epoch": 3.2403345616189974, "grad_norm": 0.4403448700904846, "learning_rate": 0.001, "loss": 2.2248, "step": 55400 }, { "epoch": 3.2461835409721003, "grad_norm": 0.2708156704902649, "learning_rate": 0.001, "loss": 2.229, "step": 55500 }, { "epoch": 3.252032520325203, "grad_norm": 0.20077025890350342, "learning_rate": 0.001, "loss": 2.2244, "step": 55600 }, { "epoch": 3.257881499678306, "grad_norm": 0.32103246450424194, "learning_rate": 0.001, "loss": 2.2281, "step": 55700 }, { "epoch": 3.263730479031409, "grad_norm": 0.26264041662216187, "learning_rate": 0.001, "loss": 2.2306, "step": 55800 }, { "epoch": 3.269579458384512, "grad_norm": 0.28887516260147095, "learning_rate": 0.001, "loss": 2.2289, "step": 55900 }, { "epoch": 3.275428437737615, "grad_norm": 0.2798577845096588, "learning_rate": 0.001, "loss": 2.2347, "step": 56000 }, { "epoch": 3.2812774170907177, "grad_norm": 0.39745965600013733, "learning_rate": 0.001, "loss": 2.2305, "step": 56100 }, { "epoch": 3.2871263964438207, "grad_norm": 0.3231468200683594, "learning_rate": 0.001, "loss": 2.2224, "step": 56200 }, { "epoch": 3.2929753757969236, "grad_norm": 0.2751029431819916, "learning_rate": 0.001, "loss": 2.2242, "step": 56300 }, { "epoch": 3.2988243551500265, "grad_norm": 0.31753796339035034, "learning_rate": 0.001, "loss": 2.2304, "step": 56400 }, { "epoch": 3.3046733345031294, "grad_norm": 0.3017672300338745, "learning_rate": 0.001, "loss": 2.2288, "step": 56500 }, { "epoch": 3.3105223138562323, "grad_norm": 0.31237781047821045, "learning_rate": 0.001, "loss": 2.2271, "step": 56600 }, { "epoch": 3.316371293209335, "grad_norm": 0.2333793342113495, "learning_rate": 0.001, "loss": 2.2186, "step": 56700 }, { "epoch": 3.3222202725624377, "grad_norm": 0.19521252810955048, "learning_rate": 0.001, "loss": 2.2248, "step": 56800 }, { "epoch": 3.3280692519155406, "grad_norm": 0.25824403762817383, "learning_rate": 0.001, "loss": 2.2234, "step": 56900 }, { "epoch": 3.3339182312686435, "grad_norm": 0.2604947090148926, "learning_rate": 0.001, "loss": 2.2214, "step": 57000 }, { "epoch": 3.3397672106217464, "grad_norm": 0.25063931941986084, "learning_rate": 0.001, "loss": 2.2279, "step": 57100 }, { "epoch": 3.3456161899748493, "grad_norm": 0.25333449244499207, "learning_rate": 0.001, "loss": 2.2194, "step": 57200 }, { "epoch": 3.3514651693279522, "grad_norm": 0.2244035303592682, "learning_rate": 0.001, "loss": 2.2188, "step": 57300 }, { "epoch": 3.357314148681055, "grad_norm": 0.22740265727043152, "learning_rate": 0.001, "loss": 2.2186, "step": 57400 }, { "epoch": 3.363163128034158, "grad_norm": 0.2253979742527008, "learning_rate": 0.001, "loss": 2.2197, "step": 57500 }, { "epoch": 3.369012107387261, "grad_norm": 0.21169120073318481, "learning_rate": 0.001, "loss": 2.2178, "step": 57600 }, { "epoch": 3.374861086740364, "grad_norm": 0.265794038772583, "learning_rate": 0.001, "loss": 2.225, "step": 57700 }, { "epoch": 3.380710066093467, "grad_norm": 0.22755564749240875, "learning_rate": 0.001, "loss": 2.2192, "step": 57800 }, { "epoch": 3.3865590454465697, "grad_norm": 0.26817458868026733, "learning_rate": 0.001, "loss": 2.2209, "step": 57900 }, { "epoch": 3.3924080247996726, "grad_norm": 0.27358779311180115, "learning_rate": 0.001, "loss": 2.224, "step": 58000 }, { "epoch": 3.398257004152775, "grad_norm": 0.26544833183288574, "learning_rate": 0.001, "loss": 2.225, "step": 58100 }, { "epoch": 3.404105983505878, "grad_norm": 0.3383048176765442, "learning_rate": 0.001, "loss": 2.2216, "step": 58200 }, { "epoch": 3.409954962858981, "grad_norm": 0.26683348417282104, "learning_rate": 0.001, "loss": 2.2199, "step": 58300 }, { "epoch": 3.415803942212084, "grad_norm": 0.2360747754573822, "learning_rate": 0.001, "loss": 2.2156, "step": 58400 }, { "epoch": 3.4216529215651867, "grad_norm": 0.3263665735721588, "learning_rate": 0.001, "loss": 2.221, "step": 58500 }, { "epoch": 3.4275019009182897, "grad_norm": 0.2659000754356384, "learning_rate": 0.001, "loss": 2.2151, "step": 58600 }, { "epoch": 3.4333508802713926, "grad_norm": 0.2187013030052185, "learning_rate": 0.001, "loss": 2.2131, "step": 58700 }, { "epoch": 3.4391998596244955, "grad_norm": 0.2579331398010254, "learning_rate": 0.001, "loss": 2.2138, "step": 58800 }, { "epoch": 3.4450488389775984, "grad_norm": 0.29414263367652893, "learning_rate": 0.001, "loss": 2.2205, "step": 58900 }, { "epoch": 3.4508978183307013, "grad_norm": 0.23132246732711792, "learning_rate": 0.001, "loss": 2.2161, "step": 59000 }, { "epoch": 3.456746797683804, "grad_norm": 0.2522469162940979, "learning_rate": 0.001, "loss": 2.2144, "step": 59100 }, { "epoch": 3.462595777036907, "grad_norm": 0.21927669644355774, "learning_rate": 0.001, "loss": 2.2126, "step": 59200 }, { "epoch": 3.46844475639001, "grad_norm": 0.2894374430179596, "learning_rate": 0.001, "loss": 2.217, "step": 59300 }, { "epoch": 3.474293735743113, "grad_norm": 0.28218981623649597, "learning_rate": 0.001, "loss": 2.216, "step": 59400 }, { "epoch": 3.480142715096216, "grad_norm": 0.2399386763572693, "learning_rate": 0.001, "loss": 2.2168, "step": 59500 }, { "epoch": 3.485991694449319, "grad_norm": 0.26047995686531067, "learning_rate": 0.001, "loss": 2.2174, "step": 59600 }, { "epoch": 3.4918406738024217, "grad_norm": 0.27258962392807007, "learning_rate": 0.001, "loss": 2.2131, "step": 59700 }, { "epoch": 3.4976896531555246, "grad_norm": 0.22053499519824982, "learning_rate": 0.001, "loss": 2.2171, "step": 59800 }, { "epoch": 3.5035386325086275, "grad_norm": 0.26189330220222473, "learning_rate": 0.001, "loss": 2.2158, "step": 59900 }, { "epoch": 3.50938761186173, "grad_norm": 0.2722199559211731, "learning_rate": 0.001, "loss": 2.2125, "step": 60000 }, { "epoch": 3.515236591214833, "grad_norm": 0.26557594537734985, "learning_rate": 0.001, "loss": 2.2176, "step": 60100 }, { "epoch": 3.521085570567936, "grad_norm": 0.2289268970489502, "learning_rate": 0.001, "loss": 2.2141, "step": 60200 }, { "epoch": 3.5269345499210387, "grad_norm": 0.2925903797149658, "learning_rate": 0.001, "loss": 2.2143, "step": 60300 }, { "epoch": 3.5327835292741416, "grad_norm": 0.30857157707214355, "learning_rate": 0.001, "loss": 2.2184, "step": 60400 }, { "epoch": 3.5386325086272445, "grad_norm": 0.3331073820590973, "learning_rate": 0.001, "loss": 2.2156, "step": 60500 }, { "epoch": 3.5444814879803475, "grad_norm": 0.21317027509212494, "learning_rate": 0.001, "loss": 2.2163, "step": 60600 }, { "epoch": 3.5503304673334504, "grad_norm": 0.2883853316307068, "learning_rate": 0.001, "loss": 2.2133, "step": 60700 }, { "epoch": 3.5561794466865533, "grad_norm": 0.22872798144817352, "learning_rate": 0.001, "loss": 2.2164, "step": 60800 }, { "epoch": 3.562028426039656, "grad_norm": 0.2483518421649933, "learning_rate": 0.001, "loss": 2.212, "step": 60900 }, { "epoch": 3.567877405392759, "grad_norm": 0.23769812285900116, "learning_rate": 0.001, "loss": 2.2174, "step": 61000 }, { "epoch": 3.5737263847458616, "grad_norm": 0.2736220359802246, "learning_rate": 0.001, "loss": 2.2171, "step": 61100 }, { "epoch": 3.5795753640989645, "grad_norm": 0.2710447311401367, "learning_rate": 0.001, "loss": 2.2066, "step": 61200 }, { "epoch": 3.5854243434520674, "grad_norm": 0.26453039050102234, "learning_rate": 0.001, "loss": 2.2073, "step": 61300 }, { "epoch": 3.5912733228051703, "grad_norm": 0.23133374750614166, "learning_rate": 0.001, "loss": 2.2123, "step": 61400 }, { "epoch": 3.597122302158273, "grad_norm": 0.2897600531578064, "learning_rate": 0.001, "loss": 2.2127, "step": 61500 }, { "epoch": 3.602971281511376, "grad_norm": 0.33596092462539673, "learning_rate": 0.001, "loss": 2.2125, "step": 61600 }, { "epoch": 3.608820260864479, "grad_norm": 0.35423240065574646, "learning_rate": 0.001, "loss": 2.2114, "step": 61700 }, { "epoch": 3.614669240217582, "grad_norm": 0.323722243309021, "learning_rate": 0.001, "loss": 2.214, "step": 61800 }, { "epoch": 3.620518219570685, "grad_norm": 0.3668595850467682, "learning_rate": 0.001, "loss": 2.213, "step": 61900 }, { "epoch": 3.626367198923788, "grad_norm": 0.24255801737308502, "learning_rate": 0.001, "loss": 2.2114, "step": 62000 }, { "epoch": 3.6322161782768907, "grad_norm": 0.20895570516586304, "learning_rate": 0.001, "loss": 2.2046, "step": 62100 }, { "epoch": 3.6380651576299936, "grad_norm": 0.248959481716156, "learning_rate": 0.001, "loss": 2.2067, "step": 62200 }, { "epoch": 3.6439141369830965, "grad_norm": 0.23653684556484222, "learning_rate": 0.001, "loss": 2.2037, "step": 62300 }, { "epoch": 3.6497631163361994, "grad_norm": 0.30436742305755615, "learning_rate": 0.001, "loss": 2.204, "step": 62400 }, { "epoch": 3.6556120956893023, "grad_norm": 0.2230135202407837, "learning_rate": 0.001, "loss": 2.2019, "step": 62500 }, { "epoch": 3.6614610750424053, "grad_norm": 0.2305396944284439, "learning_rate": 0.001, "loss": 2.2118, "step": 62600 }, { "epoch": 3.667310054395508, "grad_norm": 0.28586235642433167, "learning_rate": 0.001, "loss": 2.2029, "step": 62700 }, { "epoch": 3.673159033748611, "grad_norm": 0.23011887073516846, "learning_rate": 0.001, "loss": 2.208, "step": 62800 }, { "epoch": 3.679008013101714, "grad_norm": 0.30121317505836487, "learning_rate": 0.001, "loss": 2.2041, "step": 62900 }, { "epoch": 3.684856992454817, "grad_norm": 0.24861793220043182, "learning_rate": 0.001, "loss": 2.2126, "step": 63000 }, { "epoch": 3.6907059718079194, "grad_norm": 0.2825498878955841, "learning_rate": 0.001, "loss": 2.2089, "step": 63100 }, { "epoch": 3.6965549511610223, "grad_norm": 0.225422203540802, "learning_rate": 0.001, "loss": 2.2076, "step": 63200 }, { "epoch": 3.702403930514125, "grad_norm": 0.21928207576274872, "learning_rate": 0.001, "loss": 2.2038, "step": 63300 }, { "epoch": 3.708252909867228, "grad_norm": 0.27207332849502563, "learning_rate": 0.001, "loss": 2.1986, "step": 63400 }, { "epoch": 3.714101889220331, "grad_norm": 0.34131869673728943, "learning_rate": 0.001, "loss": 2.1989, "step": 63500 }, { "epoch": 3.719950868573434, "grad_norm": 0.24953515827655792, "learning_rate": 0.001, "loss": 2.2019, "step": 63600 }, { "epoch": 3.725799847926537, "grad_norm": 0.2927311360836029, "learning_rate": 0.001, "loss": 2.2015, "step": 63700 }, { "epoch": 3.7316488272796398, "grad_norm": 0.3475852608680725, "learning_rate": 0.001, "loss": 2.2032, "step": 63800 }, { "epoch": 3.7374978066327427, "grad_norm": 0.23926876485347748, "learning_rate": 0.001, "loss": 2.207, "step": 63900 }, { "epoch": 3.7433467859858456, "grad_norm": 0.34814897179603577, "learning_rate": 0.001, "loss": 2.2078, "step": 64000 }, { "epoch": 3.7491957653389485, "grad_norm": 0.21626752614974976, "learning_rate": 0.001, "loss": 2.2042, "step": 64100 }, { "epoch": 3.755044744692051, "grad_norm": 0.24156533181667328, "learning_rate": 0.001, "loss": 2.1996, "step": 64200 }, { "epoch": 3.760893724045154, "grad_norm": 0.30717816948890686, "learning_rate": 0.001, "loss": 2.2041, "step": 64300 }, { "epoch": 3.766742703398257, "grad_norm": 0.2585253119468689, "learning_rate": 0.001, "loss": 2.2004, "step": 64400 }, { "epoch": 3.7725916827513597, "grad_norm": 0.2255106121301651, "learning_rate": 0.001, "loss": 2.2007, "step": 64500 }, { "epoch": 3.7784406621044626, "grad_norm": 0.27079662680625916, "learning_rate": 0.001, "loss": 2.2066, "step": 64600 }, { "epoch": 3.7842896414575655, "grad_norm": 0.37730872631073, "learning_rate": 0.001, "loss": 2.2003, "step": 64700 }, { "epoch": 3.7901386208106684, "grad_norm": 0.24293683469295502, "learning_rate": 0.001, "loss": 2.2055, "step": 64800 }, { "epoch": 3.7959876001637713, "grad_norm": 0.34883764386177063, "learning_rate": 0.001, "loss": 2.1998, "step": 64900 }, { "epoch": 3.8018365795168743, "grad_norm": 0.22495447099208832, "learning_rate": 0.001, "loss": 2.1981, "step": 65000 }, { "epoch": 3.807685558869977, "grad_norm": 0.298450231552124, "learning_rate": 0.001, "loss": 2.2044, "step": 65100 }, { "epoch": 3.81353453822308, "grad_norm": 0.3853253126144409, "learning_rate": 0.001, "loss": 2.2067, "step": 65200 }, { "epoch": 3.819383517576183, "grad_norm": 0.25424304604530334, "learning_rate": 0.001, "loss": 2.2046, "step": 65300 }, { "epoch": 3.825232496929286, "grad_norm": 0.25476765632629395, "learning_rate": 0.001, "loss": 2.2018, "step": 65400 }, { "epoch": 3.831081476282389, "grad_norm": 0.3019183278083801, "learning_rate": 0.001, "loss": 2.1949, "step": 65500 }, { "epoch": 3.8369304556354917, "grad_norm": 0.26399731636047363, "learning_rate": 0.001, "loss": 2.1965, "step": 65600 }, { "epoch": 3.8427794349885946, "grad_norm": 0.3056690990924835, "learning_rate": 0.001, "loss": 2.2002, "step": 65700 }, { "epoch": 3.8486284143416976, "grad_norm": 0.271051824092865, "learning_rate": 0.001, "loss": 2.1998, "step": 65800 }, { "epoch": 3.8544773936948005, "grad_norm": 0.24592196941375732, "learning_rate": 0.001, "loss": 2.2008, "step": 65900 }, { "epoch": 3.8603263730479034, "grad_norm": 0.21775183081626892, "learning_rate": 0.001, "loss": 2.1991, "step": 66000 }, { "epoch": 3.8661753524010063, "grad_norm": 0.2960897386074066, "learning_rate": 0.001, "loss": 2.2004, "step": 66100 }, { "epoch": 3.8720243317541088, "grad_norm": 0.2380296289920807, "learning_rate": 0.001, "loss": 2.1985, "step": 66200 }, { "epoch": 3.8778733111072117, "grad_norm": 0.2665484845638275, "learning_rate": 0.001, "loss": 2.1951, "step": 66300 }, { "epoch": 3.8837222904603146, "grad_norm": 0.26293209195137024, "learning_rate": 0.001, "loss": 2.1995, "step": 66400 }, { "epoch": 3.8895712698134175, "grad_norm": 0.25359994173049927, "learning_rate": 0.001, "loss": 2.1969, "step": 66500 }, { "epoch": 3.8954202491665204, "grad_norm": 0.24254271388053894, "learning_rate": 0.001, "loss": 2.1949, "step": 66600 }, { "epoch": 3.9012692285196233, "grad_norm": 0.2783384621143341, "learning_rate": 0.001, "loss": 2.1963, "step": 66700 }, { "epoch": 3.9071182078727262, "grad_norm": 0.2734212279319763, "learning_rate": 0.001, "loss": 2.2001, "step": 66800 }, { "epoch": 3.912967187225829, "grad_norm": 0.29621100425720215, "learning_rate": 0.001, "loss": 2.2028, "step": 66900 }, { "epoch": 3.918816166578932, "grad_norm": 0.28834739327430725, "learning_rate": 0.001, "loss": 2.1944, "step": 67000 }, { "epoch": 3.924665145932035, "grad_norm": 0.30477920174598694, "learning_rate": 0.001, "loss": 2.193, "step": 67100 }, { "epoch": 3.930514125285138, "grad_norm": 0.3457719087600708, "learning_rate": 0.001, "loss": 2.1953, "step": 67200 }, { "epoch": 3.9363631046382404, "grad_norm": 0.3674408197402954, "learning_rate": 0.001, "loss": 2.1966, "step": 67300 }, { "epoch": 3.9422120839913433, "grad_norm": 0.2328823059797287, "learning_rate": 0.001, "loss": 2.1952, "step": 67400 }, { "epoch": 3.948061063344446, "grad_norm": 0.2557891011238098, "learning_rate": 0.001, "loss": 2.1912, "step": 67500 }, { "epoch": 3.953910042697549, "grad_norm": 0.3163391649723053, "learning_rate": 0.001, "loss": 2.1891, "step": 67600 }, { "epoch": 3.959759022050652, "grad_norm": 0.2363991141319275, "learning_rate": 0.001, "loss": 2.1937, "step": 67700 }, { "epoch": 3.965608001403755, "grad_norm": 0.24695073068141937, "learning_rate": 0.001, "loss": 2.1925, "step": 67800 }, { "epoch": 3.971456980756858, "grad_norm": 0.20647160708904266, "learning_rate": 0.001, "loss": 2.1927, "step": 67900 }, { "epoch": 3.9773059601099607, "grad_norm": 0.20831358432769775, "learning_rate": 0.001, "loss": 2.1993, "step": 68000 }, { "epoch": 3.9831549394630636, "grad_norm": 0.2963276505470276, "learning_rate": 0.001, "loss": 2.1998, "step": 68100 }, { "epoch": 3.9890039188161666, "grad_norm": 0.1993156522512436, "learning_rate": 0.001, "loss": 2.194, "step": 68200 }, { "epoch": 3.9948528981692695, "grad_norm": 0.263876348733902, "learning_rate": 0.001, "loss": 2.1885, "step": 68300 }, { "epoch": 4.000701877522372, "grad_norm": 0.4438401162624359, "learning_rate": 0.001, "loss": 2.1858, "step": 68400 }, { "epoch": 4.006550856875475, "grad_norm": 0.23596565425395966, "learning_rate": 0.001, "loss": 2.1753, "step": 68500 }, { "epoch": 4.012399836228578, "grad_norm": 0.30346983671188354, "learning_rate": 0.001, "loss": 2.1746, "step": 68600 }, { "epoch": 4.018248815581681, "grad_norm": 0.303794801235199, "learning_rate": 0.001, "loss": 2.1757, "step": 68700 }, { "epoch": 4.024097794934784, "grad_norm": 0.24818551540374756, "learning_rate": 0.001, "loss": 2.1742, "step": 68800 }, { "epoch": 4.029946774287887, "grad_norm": 0.26956814527511597, "learning_rate": 0.001, "loss": 2.1801, "step": 68900 }, { "epoch": 4.03579575364099, "grad_norm": 0.2874223589897156, "learning_rate": 0.001, "loss": 2.1757, "step": 69000 }, { "epoch": 4.041644732994093, "grad_norm": 0.32120081782341003, "learning_rate": 0.001, "loss": 2.1784, "step": 69100 }, { "epoch": 4.047493712347196, "grad_norm": 0.26559701561927795, "learning_rate": 0.001, "loss": 2.1761, "step": 69200 }, { "epoch": 4.053342691700299, "grad_norm": 0.2750966250896454, "learning_rate": 0.001, "loss": 2.1793, "step": 69300 }, { "epoch": 4.0591916710534015, "grad_norm": 0.31287461519241333, "learning_rate": 0.001, "loss": 2.1743, "step": 69400 }, { "epoch": 4.065040650406504, "grad_norm": 0.28394412994384766, "learning_rate": 0.001, "loss": 2.1747, "step": 69500 }, { "epoch": 4.070889629759607, "grad_norm": 0.33040472865104675, "learning_rate": 0.001, "loss": 2.1754, "step": 69600 }, { "epoch": 4.07673860911271, "grad_norm": 0.30028486251831055, "learning_rate": 0.001, "loss": 2.1743, "step": 69700 }, { "epoch": 4.082587588465813, "grad_norm": 0.27343881130218506, "learning_rate": 0.001, "loss": 2.1783, "step": 69800 }, { "epoch": 4.088436567818915, "grad_norm": 0.25762441754341125, "learning_rate": 0.001, "loss": 2.1767, "step": 69900 }, { "epoch": 4.094285547172018, "grad_norm": 0.3064455986022949, "learning_rate": 0.001, "loss": 2.1753, "step": 70000 }, { "epoch": 4.100134526525121, "grad_norm": 0.28641772270202637, "learning_rate": 0.001, "loss": 2.174, "step": 70100 }, { "epoch": 4.105983505878224, "grad_norm": 0.2549508810043335, "learning_rate": 0.001, "loss": 2.1793, "step": 70200 }, { "epoch": 4.111832485231327, "grad_norm": 0.31546318531036377, "learning_rate": 0.001, "loss": 2.1746, "step": 70300 }, { "epoch": 4.11768146458443, "grad_norm": 0.28889843821525574, "learning_rate": 0.001, "loss": 2.1772, "step": 70400 }, { "epoch": 4.123530443937533, "grad_norm": 0.29625648260116577, "learning_rate": 0.001, "loss": 2.174, "step": 70500 }, { "epoch": 4.129379423290636, "grad_norm": 0.25656941533088684, "learning_rate": 0.001, "loss": 2.1731, "step": 70600 }, { "epoch": 4.1352284026437385, "grad_norm": 0.29215213656425476, "learning_rate": 0.001, "loss": 2.1739, "step": 70700 }, { "epoch": 4.141077381996841, "grad_norm": 0.2735897898674011, "learning_rate": 0.001, "loss": 2.1771, "step": 70800 }, { "epoch": 4.146926361349944, "grad_norm": 0.25659072399139404, "learning_rate": 0.001, "loss": 2.1743, "step": 70900 }, { "epoch": 4.152775340703047, "grad_norm": 0.25245243310928345, "learning_rate": 0.001, "loss": 2.1687, "step": 71000 }, { "epoch": 4.15862432005615, "grad_norm": 0.32524120807647705, "learning_rate": 0.001, "loss": 2.1745, "step": 71100 }, { "epoch": 4.164473299409253, "grad_norm": 0.2935332655906677, "learning_rate": 0.001, "loss": 2.178, "step": 71200 }, { "epoch": 4.170322278762356, "grad_norm": 0.3075720965862274, "learning_rate": 0.001, "loss": 2.1702, "step": 71300 }, { "epoch": 4.176171258115459, "grad_norm": 0.3025859594345093, "learning_rate": 0.001, "loss": 2.176, "step": 71400 }, { "epoch": 4.182020237468562, "grad_norm": 0.2977749705314636, "learning_rate": 0.001, "loss": 2.1699, "step": 71500 }, { "epoch": 4.187869216821665, "grad_norm": 0.23678414523601532, "learning_rate": 0.001, "loss": 2.1772, "step": 71600 }, { "epoch": 4.193718196174768, "grad_norm": 0.2987750768661499, "learning_rate": 0.001, "loss": 2.1747, "step": 71700 }, { "epoch": 4.1995671755278705, "grad_norm": 0.2326747477054596, "learning_rate": 0.001, "loss": 2.1727, "step": 71800 }, { "epoch": 4.205416154880973, "grad_norm": 0.2699127197265625, "learning_rate": 0.001, "loss": 2.1799, "step": 71900 }, { "epoch": 4.211265134234076, "grad_norm": 0.3086366355419159, "learning_rate": 0.001, "loss": 2.1749, "step": 72000 }, { "epoch": 4.217114113587179, "grad_norm": 0.35205894708633423, "learning_rate": 0.001, "loss": 2.1791, "step": 72100 }, { "epoch": 4.222963092940282, "grad_norm": 0.3430802524089813, "learning_rate": 0.001, "loss": 2.172, "step": 72200 }, { "epoch": 4.228812072293385, "grad_norm": 0.2566537857055664, "learning_rate": 0.001, "loss": 2.173, "step": 72300 }, { "epoch": 4.234661051646488, "grad_norm": 0.27428436279296875, "learning_rate": 0.001, "loss": 2.1683, "step": 72400 }, { "epoch": 4.240510030999591, "grad_norm": 0.26597464084625244, "learning_rate": 0.001, "loss": 2.1748, "step": 72500 }, { "epoch": 4.246359010352694, "grad_norm": 0.3265697956085205, "learning_rate": 0.001, "loss": 2.1757, "step": 72600 }, { "epoch": 4.252207989705797, "grad_norm": 0.25354719161987305, "learning_rate": 0.001, "loss": 2.1723, "step": 72700 }, { "epoch": 4.2580569690589, "grad_norm": 0.27575889229774475, "learning_rate": 0.001, "loss": 2.1741, "step": 72800 }, { "epoch": 4.2639059484120025, "grad_norm": 0.2788533568382263, "learning_rate": 0.001, "loss": 2.1718, "step": 72900 }, { "epoch": 4.269754927765105, "grad_norm": 0.274217814207077, "learning_rate": 0.001, "loss": 2.1668, "step": 73000 }, { "epoch": 4.2756039071182075, "grad_norm": 0.2816111445426941, "learning_rate": 0.001, "loss": 2.1716, "step": 73100 }, { "epoch": 4.28145288647131, "grad_norm": 0.4137522876262665, "learning_rate": 0.001, "loss": 2.1724, "step": 73200 }, { "epoch": 4.287301865824413, "grad_norm": 0.34567728638648987, "learning_rate": 0.001, "loss": 2.1731, "step": 73300 }, { "epoch": 4.293150845177516, "grad_norm": 0.2799455225467682, "learning_rate": 0.001, "loss": 2.1737, "step": 73400 }, { "epoch": 4.298999824530619, "grad_norm": 0.20527416467666626, "learning_rate": 0.001, "loss": 2.1708, "step": 73500 }, { "epoch": 4.304848803883722, "grad_norm": 0.27814435958862305, "learning_rate": 0.001, "loss": 2.1708, "step": 73600 }, { "epoch": 4.310697783236825, "grad_norm": 0.3121950030326843, "learning_rate": 0.001, "loss": 2.1698, "step": 73700 }, { "epoch": 4.316546762589928, "grad_norm": 0.25618159770965576, "learning_rate": 0.001, "loss": 2.1647, "step": 73800 }, { "epoch": 4.322395741943031, "grad_norm": 0.2395668774843216, "learning_rate": 0.001, "loss": 2.1653, "step": 73900 }, { "epoch": 4.328244721296134, "grad_norm": 0.37087202072143555, "learning_rate": 0.001, "loss": 2.1665, "step": 74000 }, { "epoch": 4.334093700649237, "grad_norm": 0.32888302206993103, "learning_rate": 0.001, "loss": 2.1688, "step": 74100 }, { "epoch": 4.3399426800023395, "grad_norm": 0.28976815938949585, "learning_rate": 0.001, "loss": 2.168, "step": 74200 }, { "epoch": 4.345791659355442, "grad_norm": 0.2903531491756439, "learning_rate": 0.001, "loss": 2.1719, "step": 74300 }, { "epoch": 4.351640638708545, "grad_norm": 0.296682745218277, "learning_rate": 0.001, "loss": 2.1672, "step": 74400 }, { "epoch": 4.357489618061648, "grad_norm": 0.2816332280635834, "learning_rate": 0.001, "loss": 2.1785, "step": 74500 }, { "epoch": 4.363338597414751, "grad_norm": 0.2604549825191498, "learning_rate": 0.001, "loss": 2.1678, "step": 74600 }, { "epoch": 4.369187576767854, "grad_norm": 0.26231488585472107, "learning_rate": 0.001, "loss": 2.1706, "step": 74700 }, { "epoch": 4.375036556120957, "grad_norm": 0.32147058844566345, "learning_rate": 0.001, "loss": 2.1678, "step": 74800 }, { "epoch": 4.38088553547406, "grad_norm": 0.34617045521736145, "learning_rate": 0.001, "loss": 2.1696, "step": 74900 }, { "epoch": 4.386734514827163, "grad_norm": 0.24648459255695343, "learning_rate": 0.001, "loss": 2.1715, "step": 75000 }, { "epoch": 4.386734514827163, "eval_ag_news_accuracy": 0.2241875, "eval_ag_news_bleu_score": 5.0820913832930295, "eval_ag_news_bleu_score_sem": 0.3777105178673791, "eval_ag_news_emb_cos_sim": 0.6239420175552368, "eval_ag_news_emb_cos_sim_sem": 0.015652703121304512, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.9160425662994385, "eval_ag_news_n_ngrams_match_1": 12.46875, "eval_ag_news_n_ngrams_match_2": 3.0390625, "eval_ag_news_n_ngrams_match_3": 0.9921875, "eval_ag_news_num_pred_words": 48.1875, "eval_ag_news_num_true_words": 45.25, "eval_ag_news_perplexity": 18.46805654272548, "eval_ag_news_pred_num_tokens": 76.0546875, "eval_ag_news_rouge_score": 0.25515468262259583, "eval_ag_news_runtime": 38.0887, "eval_ag_news_samples_per_second": 13.127, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.2911871644475106, "eval_ag_news_token_set_f1_sem": 0.010237893344056996, "eval_ag_news_token_set_precision": 0.2677595753254693, "eval_ag_news_token_set_recall": 0.3371729905465722, "eval_ag_news_true_num_tokens": 62.7109375, "step": 75000 }, { "epoch": 4.386734514827163, "eval_anthropic_toxic_prompts_accuracy": 0.097890625, "eval_anthropic_toxic_prompts_bleu_score": 32.05658030708733, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.331433019724783, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8580397367477417, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009920100681483746, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.0625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.021479481144483458, "eval_anthropic_toxic_prompts_loss": 1.4571199417114258, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 7.4296875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 4.0703125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 2.375, "eval_anthropic_toxic_prompts_num_pred_words": 13.015625, "eval_anthropic_toxic_prompts_num_true_words": 13.0234375, "eval_anthropic_toxic_prompts_perplexity": 4.293575955181468, "eval_anthropic_toxic_prompts_pred_num_tokens": 16.6875, "eval_anthropic_toxic_prompts_rouge_score": 0.6127434341079054, "eval_anthropic_toxic_prompts_runtime": 29.9948, "eval_anthropic_toxic_prompts_samples_per_second": 16.67, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.6336150658409416, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017776811652596102, "eval_anthropic_toxic_prompts_token_set_precision": 0.6245947415040458, "eval_anthropic_toxic_prompts_token_set_recall": 0.6490670105615479, "eval_anthropic_toxic_prompts_true_num_tokens": 16.03125, "step": 75000 }, { "epoch": 4.386734514827163, "eval_arxiv_accuracy": 0.359328125, "eval_arxiv_bleu_score": 1.0704268825672096, "eval_arxiv_bleu_score_sem": 0.09716147866164637, "eval_arxiv_emb_cos_sim": 0.33179235458374023, "eval_arxiv_emb_cos_sim_sem": 0.015915632247924805, "eval_arxiv_emb_top1_equal": 0.8046875, "eval_arxiv_emb_top1_equal_sem": 0.03517845645546913, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.559847354888916, "eval_arxiv_n_ngrams_match_1": 9.4140625, "eval_arxiv_n_ngrams_match_2": 1.546875, "eval_arxiv_n_ngrams_match_3": 0.203125, "eval_arxiv_num_pred_words": 48.5703125, "eval_arxiv_num_true_words": 86.3203125, "eval_arxiv_perplexity": 35.15783006461205, "eval_arxiv_pred_num_tokens": 126.40625, "eval_arxiv_rouge_score": 0.1308382545516098, "eval_arxiv_runtime": 30.4041, "eval_arxiv_samples_per_second": 16.445, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.12253000051850885, "eval_arxiv_token_set_f1_sem": 0.0076239094098683475, "eval_arxiv_token_set_precision": 0.07724060419528161, "eval_arxiv_token_set_recall": 0.3619224119476593, "eval_arxiv_true_num_tokens": 124.515625, "step": 75000 }, { "epoch": 4.386734514827163, "eval_python_code_alpaca_accuracy": 0.12259375, "eval_python_code_alpaca_bleu_score": 20.650364879426593, "eval_python_code_alpaca_bleu_score_sem": 1.5890638501297312, "eval_python_code_alpaca_emb_cos_sim": 0.8163971900939941, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011218786239624023, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.7168112993240356, "eval_python_code_alpaca_n_ngrams_match_1": 9.2109375, "eval_python_code_alpaca_n_ngrams_match_2": 4.25, "eval_python_code_alpaca_n_ngrams_match_3": 2.1171875, "eval_python_code_alpaca_num_pred_words": 16.7578125, "eval_python_code_alpaca_num_true_words": 18.1171875, "eval_python_code_alpaca_perplexity": 5.566749435651853, "eval_python_code_alpaca_pred_num_tokens": 22.4609375, "eval_python_code_alpaca_rouge_score": 0.5343721461319394, "eval_python_code_alpaca_runtime": 29.0746, "eval_python_code_alpaca_samples_per_second": 17.197, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.5475895316288464, "eval_python_code_alpaca_token_set_f1_sem": 0.015512929536293393, "eval_python_code_alpaca_token_set_precision": 0.5257202753062461, "eval_python_code_alpaca_token_set_recall": 0.5775209661281213, "eval_python_code_alpaca_true_num_tokens": 23.734375, "step": 75000 }, { "epoch": 4.386734514827163, "eval_wikibio_accuracy": 0.3518125, "eval_wikibio_bleu_score": 3.985918034242436, "eval_wikibio_bleu_score_sem": 0.5479142379678461, "eval_wikibio_emb_cos_sim": 0.4079264998435974, "eval_wikibio_emb_cos_sim_sem": 0.023436851799488068, "eval_wikibio_emb_top1_equal": 0.7578125, "eval_wikibio_emb_top1_equal_sem": 0.03801498934626579, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.843187093734741, "eval_wikibio_n_ngrams_match_1": 10.9765625, "eval_wikibio_n_ngrams_match_2": 3.046875, "eval_wikibio_n_ngrams_match_3": 1.1328125, "eval_wikibio_num_pred_words": 49.5234375, "eval_wikibio_num_true_words": 53.109375, "eval_wikibio_perplexity": 17.170402105939786, "eval_wikibio_pred_num_tokens": 115.2890625, "eval_wikibio_rouge_score": 0.19501220766171967, "eval_wikibio_runtime": 30.2875, "eval_wikibio_samples_per_second": 16.508, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2214034298116783, "eval_wikibio_token_set_f1_sem": 0.014500869932284654, "eval_wikibio_token_set_precision": 0.18545923249508844, "eval_wikibio_token_set_recall": 0.3370688211548325, "eval_wikibio_true_num_tokens": 100.9765625, "step": 75000 }, { "epoch": 4.386734514827163, "eval_msmarco_accuracy": 0.356015625, "eval_msmarco_bleu_score": 11.084157681783452, "eval_msmarco_bleu_score_sem": 1.1649931037806336, "eval_msmarco_emb_cos_sim": 0.6933435797691345, "eval_msmarco_emb_cos_sim_sem": 0.019720003008842468, "eval_msmarco_emb_top1_equal": 0.9140625, "eval_msmarco_emb_top1_equal_sem": 0.024870097637176514, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 2.0393717288970947, "eval_msmarco_n_ngrams_match_1": 24.296875, "eval_msmarco_n_ngrams_match_2": 9.5078125, "eval_msmarco_n_ngrams_match_3": 4.7265625, "eval_msmarco_num_pred_words": 65.5546875, "eval_msmarco_num_true_words": 64.671875, "eval_msmarco_perplexity": 7.685778928874022, "eval_msmarco_pred_num_tokens": 93.6328125, "eval_msmarco_rouge_score": 0.3516144411630836, "eval_msmarco_runtime": 25.4054, "eval_msmarco_samples_per_second": 19.681, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.3937374244256042, "eval_msmarco_token_set_f1_sem": 0.014541070700768289, "eval_msmarco_token_set_precision": 0.34838941990971717, "eval_msmarco_token_set_recall": 0.48195896952894873, "eval_msmarco_true_num_tokens": 84.9140625, "step": 75000 }, { "epoch": 4.392583494180266, "grad_norm": 0.2868218719959259, "learning_rate": 0.001, "loss": 2.1654, "step": 75100 }, { "epoch": 4.398432473533369, "grad_norm": 0.2988479733467102, "learning_rate": 0.001, "loss": 2.1644, "step": 75200 }, { "epoch": 4.4042814528864715, "grad_norm": 0.2833825945854187, "learning_rate": 0.001, "loss": 2.1706, "step": 75300 }, { "epoch": 4.4101304322395745, "grad_norm": 0.3174901604652405, "learning_rate": 0.001, "loss": 2.1667, "step": 75400 }, { "epoch": 4.415979411592677, "grad_norm": 0.3228948712348938, "learning_rate": 0.001, "loss": 2.164, "step": 75500 }, { "epoch": 4.42182839094578, "grad_norm": 0.3032263517379761, "learning_rate": 0.001, "loss": 2.1655, "step": 75600 }, { "epoch": 4.427677370298883, "grad_norm": 0.21947941184043884, "learning_rate": 0.001, "loss": 2.1625, "step": 75700 }, { "epoch": 4.433526349651986, "grad_norm": 0.2726324498653412, "learning_rate": 0.001, "loss": 2.1656, "step": 75800 }, { "epoch": 4.439375329005089, "grad_norm": 0.2721160054206848, "learning_rate": 0.001, "loss": 2.1682, "step": 75900 }, { "epoch": 4.445224308358192, "grad_norm": 0.29018092155456543, "learning_rate": 0.001, "loss": 2.1681, "step": 76000 }, { "epoch": 4.451073287711294, "grad_norm": 0.30496954917907715, "learning_rate": 0.001, "loss": 2.1696, "step": 76100 }, { "epoch": 4.456922267064397, "grad_norm": 0.24247199296951294, "learning_rate": 0.001, "loss": 2.1623, "step": 76200 }, { "epoch": 4.4627712464175, "grad_norm": 0.24091088771820068, "learning_rate": 0.001, "loss": 2.166, "step": 76300 }, { "epoch": 4.468620225770603, "grad_norm": 0.28146350383758545, "learning_rate": 0.001, "loss": 2.1601, "step": 76400 }, { "epoch": 4.474469205123706, "grad_norm": 0.3153410851955414, "learning_rate": 0.001, "loss": 2.1656, "step": 76500 }, { "epoch": 4.4803181844768085, "grad_norm": 0.2522837817668915, "learning_rate": 0.001, "loss": 2.1635, "step": 76600 }, { "epoch": 4.486167163829911, "grad_norm": 0.32681453227996826, "learning_rate": 0.001, "loss": 2.16, "step": 76700 }, { "epoch": 4.492016143183014, "grad_norm": 0.30181339383125305, "learning_rate": 0.001, "loss": 2.1598, "step": 76800 }, { "epoch": 4.497865122536117, "grad_norm": 0.24497081339359283, "learning_rate": 0.001, "loss": 2.163, "step": 76900 }, { "epoch": 4.50371410188922, "grad_norm": 0.26367321610450745, "learning_rate": 0.001, "loss": 2.1626, "step": 77000 }, { "epoch": 4.509563081242323, "grad_norm": 0.32810741662979126, "learning_rate": 0.001, "loss": 2.163, "step": 77100 }, { "epoch": 4.515412060595426, "grad_norm": 0.2333637773990631, "learning_rate": 0.001, "loss": 2.1662, "step": 77200 }, { "epoch": 4.521261039948529, "grad_norm": 0.3019060492515564, "learning_rate": 0.001, "loss": 2.1602, "step": 77300 }, { "epoch": 4.527110019301632, "grad_norm": 0.2723366320133209, "learning_rate": 0.001, "loss": 2.1634, "step": 77400 }, { "epoch": 4.532958998654735, "grad_norm": 0.2561230957508087, "learning_rate": 0.001, "loss": 2.1632, "step": 77500 }, { "epoch": 4.538807978007838, "grad_norm": 0.2674855589866638, "learning_rate": 0.001, "loss": 2.165, "step": 77600 }, { "epoch": 4.5446569573609406, "grad_norm": 0.23991751670837402, "learning_rate": 0.001, "loss": 2.1673, "step": 77700 }, { "epoch": 4.5505059367140435, "grad_norm": 0.270466685295105, "learning_rate": 0.001, "loss": 2.16, "step": 77800 }, { "epoch": 4.556354916067146, "grad_norm": 0.24644166231155396, "learning_rate": 0.001, "loss": 2.1651, "step": 77900 }, { "epoch": 4.562203895420249, "grad_norm": 0.24498938024044037, "learning_rate": 0.001, "loss": 2.1615, "step": 78000 }, { "epoch": 4.568052874773352, "grad_norm": 0.3061765134334564, "learning_rate": 0.001, "loss": 2.1637, "step": 78100 }, { "epoch": 4.573901854126455, "grad_norm": 0.305037260055542, "learning_rate": 0.001, "loss": 2.1661, "step": 78200 }, { "epoch": 4.579750833479558, "grad_norm": 0.28959745168685913, "learning_rate": 0.001, "loss": 2.1615, "step": 78300 }, { "epoch": 4.585599812832661, "grad_norm": 0.23746539652347565, "learning_rate": 0.001, "loss": 2.1554, "step": 78400 }, { "epoch": 4.591448792185764, "grad_norm": 0.2902107834815979, "learning_rate": 0.001, "loss": 2.1603, "step": 78500 }, { "epoch": 4.597297771538867, "grad_norm": 0.2759037911891937, "learning_rate": 0.001, "loss": 2.1619, "step": 78600 }, { "epoch": 4.60314675089197, "grad_norm": 0.22513563930988312, "learning_rate": 0.001, "loss": 2.1608, "step": 78700 }, { "epoch": 4.608995730245073, "grad_norm": 0.2616240978240967, "learning_rate": 0.001, "loss": 2.1593, "step": 78800 }, { "epoch": 4.6148447095981755, "grad_norm": 0.3201628029346466, "learning_rate": 0.001, "loss": 2.1578, "step": 78900 }, { "epoch": 4.6206936889512775, "grad_norm": 0.40099528431892395, "learning_rate": 0.001, "loss": 2.1634, "step": 79000 }, { "epoch": 4.626542668304381, "grad_norm": 0.2671538293361664, "learning_rate": 0.001, "loss": 2.161, "step": 79100 }, { "epoch": 4.632391647657483, "grad_norm": 0.31046539545059204, "learning_rate": 0.001, "loss": 2.1564, "step": 79200 }, { "epoch": 4.638240627010587, "grad_norm": 0.2286563366651535, "learning_rate": 0.001, "loss": 2.1608, "step": 79300 }, { "epoch": 4.644089606363689, "grad_norm": 0.29931363463401794, "learning_rate": 0.001, "loss": 2.1632, "step": 79400 }, { "epoch": 4.649938585716792, "grad_norm": 0.282732754945755, "learning_rate": 0.001, "loss": 2.1616, "step": 79500 }, { "epoch": 4.655787565069895, "grad_norm": 0.2737692594528198, "learning_rate": 0.001, "loss": 2.1578, "step": 79600 }, { "epoch": 4.661636544422998, "grad_norm": 0.25211572647094727, "learning_rate": 0.001, "loss": 2.1546, "step": 79700 }, { "epoch": 4.667485523776101, "grad_norm": 0.34287339448928833, "learning_rate": 0.001, "loss": 2.1618, "step": 79800 }, { "epoch": 4.673334503129204, "grad_norm": 0.29630139470100403, "learning_rate": 0.001, "loss": 2.1595, "step": 79900 }, { "epoch": 4.679183482482307, "grad_norm": 0.2743169665336609, "learning_rate": 0.001, "loss": 2.1568, "step": 80000 }, { "epoch": 4.68503246183541, "grad_norm": 0.25458964705467224, "learning_rate": 0.001, "loss": 2.1651, "step": 80100 }, { "epoch": 4.6908814411885125, "grad_norm": 0.33882302045822144, "learning_rate": 0.001, "loss": 2.1616, "step": 80200 }, { "epoch": 4.696730420541615, "grad_norm": 0.30472755432128906, "learning_rate": 0.001, "loss": 2.1602, "step": 80300 }, { "epoch": 4.702579399894718, "grad_norm": 0.3309895396232605, "learning_rate": 0.001, "loss": 2.1582, "step": 80400 }, { "epoch": 4.708428379247821, "grad_norm": 0.2965795397758484, "learning_rate": 0.001, "loss": 2.16, "step": 80500 }, { "epoch": 4.714277358600924, "grad_norm": 0.31514406204223633, "learning_rate": 0.001, "loss": 2.1575, "step": 80600 }, { "epoch": 4.720126337954027, "grad_norm": 0.21959808468818665, "learning_rate": 0.001, "loss": 2.1605, "step": 80700 }, { "epoch": 4.72597531730713, "grad_norm": 0.2702164351940155, "learning_rate": 0.001, "loss": 2.1572, "step": 80800 }, { "epoch": 4.731824296660233, "grad_norm": 0.31828320026397705, "learning_rate": 0.001, "loss": 2.1565, "step": 80900 }, { "epoch": 4.737673276013336, "grad_norm": 0.3414333164691925, "learning_rate": 0.001, "loss": 2.1499, "step": 81000 }, { "epoch": 4.743522255366439, "grad_norm": 0.2650374472141266, "learning_rate": 0.001, "loss": 2.155, "step": 81100 }, { "epoch": 4.749371234719542, "grad_norm": 0.24574407935142517, "learning_rate": 0.001, "loss": 2.1496, "step": 81200 }, { "epoch": 4.7552202140726445, "grad_norm": 0.30593767762184143, "learning_rate": 0.001, "loss": 2.1556, "step": 81300 }, { "epoch": 4.761069193425747, "grad_norm": 0.35799187421798706, "learning_rate": 0.001, "loss": 2.1533, "step": 81400 }, { "epoch": 4.76691817277885, "grad_norm": 0.30872172117233276, "learning_rate": 0.001, "loss": 2.1571, "step": 81500 }, { "epoch": 4.772767152131953, "grad_norm": 0.2839216887950897, "learning_rate": 0.001, "loss": 2.1591, "step": 81600 }, { "epoch": 4.778616131485056, "grad_norm": 0.21116366982460022, "learning_rate": 0.001, "loss": 2.1517, "step": 81700 }, { "epoch": 4.784465110838159, "grad_norm": 0.22412459552288055, "learning_rate": 0.001, "loss": 2.1596, "step": 81800 }, { "epoch": 4.790314090191262, "grad_norm": 0.18584126234054565, "learning_rate": 0.001, "loss": 2.1521, "step": 81900 }, { "epoch": 4.796163069544365, "grad_norm": 0.24978245794773102, "learning_rate": 0.001, "loss": 2.1545, "step": 82000 }, { "epoch": 4.802012048897467, "grad_norm": 0.2736203372478485, "learning_rate": 0.001, "loss": 2.1623, "step": 82100 }, { "epoch": 4.807861028250571, "grad_norm": 0.3801518678665161, "learning_rate": 0.001, "loss": 2.1514, "step": 82200 }, { "epoch": 4.813710007603673, "grad_norm": 0.34937232732772827, "learning_rate": 0.001, "loss": 2.1512, "step": 82300 }, { "epoch": 4.8195589869567765, "grad_norm": 0.4196600914001465, "learning_rate": 0.001, "loss": 2.153, "step": 82400 }, { "epoch": 4.825407966309879, "grad_norm": 0.28435975313186646, "learning_rate": 0.001, "loss": 2.1543, "step": 82500 }, { "epoch": 4.8312569456629815, "grad_norm": 0.2439228594303131, "learning_rate": 0.001, "loss": 2.1554, "step": 82600 }, { "epoch": 4.837105925016084, "grad_norm": 0.22179903090000153, "learning_rate": 0.001, "loss": 2.1503, "step": 82700 }, { "epoch": 4.842954904369187, "grad_norm": 0.24325183033943176, "learning_rate": 0.001, "loss": 2.1577, "step": 82800 }, { "epoch": 4.84880388372229, "grad_norm": 0.2749015986919403, "learning_rate": 0.001, "loss": 2.15, "step": 82900 }, { "epoch": 4.854652863075393, "grad_norm": 0.2699997127056122, "learning_rate": 0.001, "loss": 2.1546, "step": 83000 }, { "epoch": 4.860501842428496, "grad_norm": 0.285241037607193, "learning_rate": 0.001, "loss": 2.1526, "step": 83100 }, { "epoch": 4.866350821781599, "grad_norm": 0.3270174562931061, "learning_rate": 0.001, "loss": 2.154, "step": 83200 }, { "epoch": 4.872199801134702, "grad_norm": 0.21459585428237915, "learning_rate": 0.001, "loss": 2.1593, "step": 83300 }, { "epoch": 4.878048780487805, "grad_norm": 0.2925507426261902, "learning_rate": 0.001, "loss": 2.1566, "step": 83400 }, { "epoch": 4.883897759840908, "grad_norm": 0.35507532954216003, "learning_rate": 0.001, "loss": 2.1559, "step": 83500 }, { "epoch": 4.889746739194011, "grad_norm": 0.1994827836751938, "learning_rate": 0.001, "loss": 2.1486, "step": 83600 }, { "epoch": 4.8955957185471135, "grad_norm": 0.33300986886024475, "learning_rate": 0.001, "loss": 2.1528, "step": 83700 }, { "epoch": 4.901444697900216, "grad_norm": 0.31174710392951965, "learning_rate": 0.001, "loss": 2.1562, "step": 83800 }, { "epoch": 4.907293677253319, "grad_norm": 0.23163671791553497, "learning_rate": 0.001, "loss": 2.1508, "step": 83900 }, { "epoch": 4.913142656606422, "grad_norm": 0.2668754756450653, "learning_rate": 0.001, "loss": 2.1524, "step": 84000 }, { "epoch": 4.918991635959525, "grad_norm": 0.32037925720214844, "learning_rate": 0.001, "loss": 2.1448, "step": 84100 }, { "epoch": 4.924840615312628, "grad_norm": 0.26700419187545776, "learning_rate": 0.001, "loss": 2.1496, "step": 84200 }, { "epoch": 4.930689594665731, "grad_norm": 0.3132065534591675, "learning_rate": 0.001, "loss": 2.154, "step": 84300 }, { "epoch": 4.936538574018834, "grad_norm": 0.3133326768875122, "learning_rate": 0.001, "loss": 2.148, "step": 84400 }, { "epoch": 4.942387553371937, "grad_norm": 0.21981407701969147, "learning_rate": 0.001, "loss": 2.1525, "step": 84500 }, { "epoch": 4.94823653272504, "grad_norm": 0.3036979138851166, "learning_rate": 0.001, "loss": 2.1488, "step": 84600 }, { "epoch": 4.954085512078143, "grad_norm": 0.2901677191257477, "learning_rate": 0.001, "loss": 2.1526, "step": 84700 }, { "epoch": 4.9599344914312455, "grad_norm": 0.3521074652671814, "learning_rate": 0.001, "loss": 2.1536, "step": 84800 }, { "epoch": 4.9657834707843485, "grad_norm": 0.25744739174842834, "learning_rate": 0.001, "loss": 2.1472, "step": 84900 }, { "epoch": 4.971632450137451, "grad_norm": 0.2530853748321533, "learning_rate": 0.001, "loss": 2.1535, "step": 85000 }, { "epoch": 4.977481429490554, "grad_norm": 0.19626569747924805, "learning_rate": 0.001, "loss": 2.1497, "step": 85100 }, { "epoch": 4.983330408843656, "grad_norm": 0.2600352168083191, "learning_rate": 0.001, "loss": 2.1511, "step": 85200 }, { "epoch": 4.98917938819676, "grad_norm": 0.3138558566570282, "learning_rate": 0.001, "loss": 2.1524, "step": 85300 }, { "epoch": 4.995028367549862, "grad_norm": 0.3762775957584381, "learning_rate": 0.001, "loss": 2.1505, "step": 85400 }, { "epoch": 5.000877346902965, "grad_norm": 0.23831824958324432, "learning_rate": 0.001, "loss": 2.1557, "step": 85500 }, { "epoch": 5.006726326256068, "grad_norm": 0.2852379381656647, "learning_rate": 0.001, "loss": 2.1374, "step": 85600 }, { "epoch": 5.012575305609171, "grad_norm": 0.22791814804077148, "learning_rate": 0.001, "loss": 2.1352, "step": 85700 }, { "epoch": 5.018424284962274, "grad_norm": 0.18622800707817078, "learning_rate": 0.001, "loss": 2.1326, "step": 85800 }, { "epoch": 5.024273264315377, "grad_norm": 0.2897436022758484, "learning_rate": 0.001, "loss": 2.1342, "step": 85900 }, { "epoch": 5.03012224366848, "grad_norm": 0.25512295961380005, "learning_rate": 0.001, "loss": 2.1357, "step": 86000 }, { "epoch": 5.0359712230215825, "grad_norm": 0.2436899095773697, "learning_rate": 0.001, "loss": 2.1308, "step": 86100 }, { "epoch": 5.041820202374685, "grad_norm": 0.2661168873310089, "learning_rate": 0.001, "loss": 2.132, "step": 86200 }, { "epoch": 5.047669181727788, "grad_norm": 0.22614355385303497, "learning_rate": 0.001, "loss": 2.1331, "step": 86300 }, { "epoch": 5.053518161080891, "grad_norm": 0.24423633515834808, "learning_rate": 0.001, "loss": 2.1359, "step": 86400 }, { "epoch": 5.059367140433994, "grad_norm": 0.22323179244995117, "learning_rate": 0.001, "loss": 2.1315, "step": 86500 }, { "epoch": 5.065216119787097, "grad_norm": 0.28088101744651794, "learning_rate": 0.001, "loss": 2.136, "step": 86600 }, { "epoch": 5.0710650991402, "grad_norm": 0.21173475682735443, "learning_rate": 0.001, "loss": 2.1382, "step": 86700 }, { "epoch": 5.076914078493303, "grad_norm": 0.2598838806152344, "learning_rate": 0.001, "loss": 2.1327, "step": 86800 }, { "epoch": 5.082763057846406, "grad_norm": 0.31291359663009644, "learning_rate": 0.001, "loss": 2.1346, "step": 86900 }, { "epoch": 5.088612037199509, "grad_norm": 0.2614854574203491, "learning_rate": 0.001, "loss": 2.1306, "step": 87000 }, { "epoch": 5.094461016552612, "grad_norm": 0.20978161692619324, "learning_rate": 0.001, "loss": 2.1257, "step": 87100 }, { "epoch": 5.1003099959057145, "grad_norm": 0.22881488502025604, "learning_rate": 0.001, "loss": 2.1374, "step": 87200 }, { "epoch": 5.1061589752588175, "grad_norm": 0.20766378939151764, "learning_rate": 0.001, "loss": 2.1338, "step": 87300 }, { "epoch": 5.11200795461192, "grad_norm": 0.2651847302913666, "learning_rate": 0.001, "loss": 2.1318, "step": 87400 }, { "epoch": 5.117856933965023, "grad_norm": 0.2402482032775879, "learning_rate": 0.001, "loss": 2.1368, "step": 87500 }, { "epoch": 5.123705913318126, "grad_norm": 0.23173804581165314, "learning_rate": 0.001, "loss": 2.1322, "step": 87600 }, { "epoch": 5.129554892671229, "grad_norm": 0.24230653047561646, "learning_rate": 0.001, "loss": 2.1376, "step": 87700 }, { "epoch": 5.135403872024332, "grad_norm": 0.210658997297287, "learning_rate": 0.001, "loss": 2.1302, "step": 87800 }, { "epoch": 5.141252851377435, "grad_norm": 0.24238458275794983, "learning_rate": 0.001, "loss": 2.1276, "step": 87900 }, { "epoch": 5.147101830730538, "grad_norm": 0.2622125744819641, "learning_rate": 0.001, "loss": 2.1268, "step": 88000 }, { "epoch": 5.152950810083641, "grad_norm": 0.2554051876068115, "learning_rate": 0.001, "loss": 2.1277, "step": 88100 }, { "epoch": 5.158799789436744, "grad_norm": 0.2502746284008026, "learning_rate": 0.001, "loss": 2.1293, "step": 88200 }, { "epoch": 5.164648768789847, "grad_norm": 0.2410629242658615, "learning_rate": 0.001, "loss": 2.1354, "step": 88300 }, { "epoch": 5.1704977481429495, "grad_norm": 0.17686983942985535, "learning_rate": 0.001, "loss": 2.1296, "step": 88400 }, { "epoch": 5.1763467274960515, "grad_norm": 0.25888848304748535, "learning_rate": 0.001, "loss": 2.1341, "step": 88500 }, { "epoch": 5.182195706849154, "grad_norm": 0.2561284303665161, "learning_rate": 0.001, "loss": 2.1297, "step": 88600 }, { "epoch": 5.188044686202257, "grad_norm": 0.2414897084236145, "learning_rate": 0.001, "loss": 2.1333, "step": 88700 }, { "epoch": 5.19389366555536, "grad_norm": 0.26673296093940735, "learning_rate": 0.001, "loss": 2.1287, "step": 88800 }, { "epoch": 5.199742644908463, "grad_norm": 0.2617727518081665, "learning_rate": 0.001, "loss": 2.1352, "step": 88900 }, { "epoch": 5.205591624261566, "grad_norm": 0.26240530610084534, "learning_rate": 0.001, "loss": 2.1276, "step": 89000 }, { "epoch": 5.211440603614669, "grad_norm": 0.2967888116836548, "learning_rate": 0.001, "loss": 2.1334, "step": 89100 }, { "epoch": 5.217289582967772, "grad_norm": 0.2241249829530716, "learning_rate": 0.001, "loss": 2.1351, "step": 89200 }, { "epoch": 5.223138562320875, "grad_norm": 0.3184453547000885, "learning_rate": 0.001, "loss": 2.1335, "step": 89300 }, { "epoch": 5.228987541673978, "grad_norm": 0.22987322509288788, "learning_rate": 0.001, "loss": 2.1294, "step": 89400 }, { "epoch": 5.234836521027081, "grad_norm": 0.29346805810928345, "learning_rate": 0.001, "loss": 2.1332, "step": 89500 }, { "epoch": 5.2406855003801835, "grad_norm": 0.2780662477016449, "learning_rate": 0.001, "loss": 2.1367, "step": 89600 }, { "epoch": 5.2465344797332865, "grad_norm": 0.2502575218677521, "learning_rate": 0.001, "loss": 2.1329, "step": 89700 }, { "epoch": 5.252383459086389, "grad_norm": 0.21832245588302612, "learning_rate": 0.001, "loss": 2.1282, "step": 89800 }, { "epoch": 5.258232438439492, "grad_norm": 0.2703295648097992, "learning_rate": 0.001, "loss": 2.1318, "step": 89900 }, { "epoch": 5.264081417792595, "grad_norm": 0.24012994766235352, "learning_rate": 0.001, "loss": 2.1328, "step": 90000 }, { "epoch": 5.269930397145698, "grad_norm": 0.31469500064849854, "learning_rate": 0.001, "loss": 2.1346, "step": 90100 }, { "epoch": 5.275779376498801, "grad_norm": 0.20914646983146667, "learning_rate": 0.001, "loss": 2.1301, "step": 90200 }, { "epoch": 5.281628355851904, "grad_norm": 0.20754943788051605, "learning_rate": 0.001, "loss": 2.1405, "step": 90300 }, { "epoch": 5.287477335205007, "grad_norm": 0.21026268601417542, "learning_rate": 0.001, "loss": 2.1279, "step": 90400 }, { "epoch": 5.29332631455811, "grad_norm": 0.2403227984905243, "learning_rate": 0.001, "loss": 2.1322, "step": 90500 }, { "epoch": 5.299175293911213, "grad_norm": 0.26532304286956787, "learning_rate": 0.001, "loss": 2.1335, "step": 90600 }, { "epoch": 5.305024273264316, "grad_norm": 0.19259081780910492, "learning_rate": 0.001, "loss": 2.1325, "step": 90700 }, { "epoch": 5.3108732526174185, "grad_norm": 0.3280028700828552, "learning_rate": 0.001, "loss": 2.1261, "step": 90800 }, { "epoch": 5.316722231970521, "grad_norm": 0.23025095462799072, "learning_rate": 0.001, "loss": 2.1336, "step": 90900 }, { "epoch": 5.322571211323624, "grad_norm": 0.25103139877319336, "learning_rate": 0.001, "loss": 2.1361, "step": 91000 }, { "epoch": 5.328420190676727, "grad_norm": 0.24228405952453613, "learning_rate": 0.001, "loss": 2.1308, "step": 91100 }, { "epoch": 5.33426917002983, "grad_norm": 0.2322128415107727, "learning_rate": 0.001, "loss": 2.131, "step": 91200 }, { "epoch": 5.340118149382933, "grad_norm": 0.2833353579044342, "learning_rate": 0.001, "loss": 2.1332, "step": 91300 }, { "epoch": 5.345967128736036, "grad_norm": 0.256968230009079, "learning_rate": 0.001, "loss": 2.1258, "step": 91400 }, { "epoch": 5.351816108089139, "grad_norm": 0.24114909768104553, "learning_rate": 0.001, "loss": 2.1242, "step": 91500 }, { "epoch": 5.357665087442241, "grad_norm": 0.29308924078941345, "learning_rate": 0.001, "loss": 2.134, "step": 91600 }, { "epoch": 5.363514066795344, "grad_norm": 0.21941666305065155, "learning_rate": 0.001, "loss": 2.1347, "step": 91700 }, { "epoch": 5.369363046148447, "grad_norm": 0.28234660625457764, "learning_rate": 0.001, "loss": 2.1313, "step": 91800 }, { "epoch": 5.37521202550155, "grad_norm": 0.278096467256546, "learning_rate": 0.001, "loss": 2.1277, "step": 91900 }, { "epoch": 5.3810610048546526, "grad_norm": 0.2523135542869568, "learning_rate": 0.001, "loss": 2.1271, "step": 92000 }, { "epoch": 5.3869099842077555, "grad_norm": 0.25298598408699036, "learning_rate": 0.001, "loss": 2.1393, "step": 92100 }, { "epoch": 5.392758963560858, "grad_norm": 0.2261531800031662, "learning_rate": 0.001, "loss": 2.1341, "step": 92200 }, { "epoch": 5.398607942913961, "grad_norm": 0.2041613608598709, "learning_rate": 0.001, "loss": 2.1327, "step": 92300 }, { "epoch": 5.404456922267064, "grad_norm": 0.3267274498939514, "learning_rate": 0.001, "loss": 2.1294, "step": 92400 }, { "epoch": 5.410305901620167, "grad_norm": 0.21766799688339233, "learning_rate": 0.001, "loss": 2.1297, "step": 92500 }, { "epoch": 5.41615488097327, "grad_norm": 0.21198849380016327, "learning_rate": 0.001, "loss": 2.1293, "step": 92600 }, { "epoch": 5.422003860326373, "grad_norm": 0.2589469850063324, "learning_rate": 0.001, "loss": 2.1248, "step": 92700 }, { "epoch": 5.427852839679476, "grad_norm": 0.242219477891922, "learning_rate": 0.001, "loss": 2.1317, "step": 92800 }, { "epoch": 5.433701819032579, "grad_norm": 0.16804341971874237, "learning_rate": 0.001, "loss": 2.1255, "step": 92900 }, { "epoch": 5.439550798385682, "grad_norm": 0.25967299938201904, "learning_rate": 0.001, "loss": 2.1244, "step": 93000 }, { "epoch": 5.445399777738785, "grad_norm": 0.2077724039554596, "learning_rate": 0.001, "loss": 2.1248, "step": 93100 }, { "epoch": 5.4512487570918875, "grad_norm": 0.22338294982910156, "learning_rate": 0.001, "loss": 2.1269, "step": 93200 }, { "epoch": 5.45709773644499, "grad_norm": 0.1851588785648346, "learning_rate": 0.001, "loss": 2.1206, "step": 93300 }, { "epoch": 5.462946715798093, "grad_norm": 0.21415391564369202, "learning_rate": 0.001, "loss": 2.1263, "step": 93400 }, { "epoch": 5.468795695151196, "grad_norm": 0.23884521424770355, "learning_rate": 0.001, "loss": 2.1262, "step": 93500 }, { "epoch": 5.474644674504299, "grad_norm": 0.21942757070064545, "learning_rate": 0.001, "loss": 2.123, "step": 93600 }, { "epoch": 5.480493653857402, "grad_norm": 0.19063684344291687, "learning_rate": 0.001, "loss": 2.1283, "step": 93700 }, { "epoch": 5.486342633210505, "grad_norm": 0.2388858050107956, "learning_rate": 0.001, "loss": 2.1297, "step": 93800 }, { "epoch": 5.492191612563608, "grad_norm": 0.28790104389190674, "learning_rate": 0.001, "loss": 2.1345, "step": 93900 }, { "epoch": 5.498040591916711, "grad_norm": 0.1930641233921051, "learning_rate": 0.001, "loss": 2.1243, "step": 94000 }, { "epoch": 5.503889571269814, "grad_norm": 0.22388367354869843, "learning_rate": 0.001, "loss": 2.1307, "step": 94100 }, { "epoch": 5.509738550622917, "grad_norm": 0.3048474192619324, "learning_rate": 0.001, "loss": 2.1227, "step": 94200 }, { "epoch": 5.5155875299760195, "grad_norm": 0.19809292256832123, "learning_rate": 0.001, "loss": 2.1278, "step": 94300 }, { "epoch": 5.521436509329122, "grad_norm": 0.24796044826507568, "learning_rate": 0.001, "loss": 2.1278, "step": 94400 }, { "epoch": 5.5272854886822245, "grad_norm": 0.2055673599243164, "learning_rate": 0.001, "loss": 2.1257, "step": 94500 }, { "epoch": 5.533134468035328, "grad_norm": 0.2162240743637085, "learning_rate": 0.001, "loss": 2.1291, "step": 94600 }, { "epoch": 5.53898344738843, "grad_norm": 0.24430686235427856, "learning_rate": 0.001, "loss": 2.123, "step": 94700 }, { "epoch": 5.544832426741534, "grad_norm": 0.2748570740222931, "learning_rate": 0.001, "loss": 2.125, "step": 94800 }, { "epoch": 5.550681406094636, "grad_norm": 0.2242583930492401, "learning_rate": 0.001, "loss": 2.1277, "step": 94900 }, { "epoch": 5.556530385447739, "grad_norm": 0.2271336317062378, "learning_rate": 0.001, "loss": 2.1243, "step": 95000 }, { "epoch": 5.562379364800842, "grad_norm": 0.2206297665834427, "learning_rate": 0.001, "loss": 2.1263, "step": 95100 }, { "epoch": 5.568228344153945, "grad_norm": 0.2993350923061371, "learning_rate": 0.001, "loss": 2.1341, "step": 95200 }, { "epoch": 5.574077323507048, "grad_norm": 0.20289494097232819, "learning_rate": 0.001, "loss": 2.126, "step": 95300 }, { "epoch": 5.579926302860151, "grad_norm": 0.19539141654968262, "learning_rate": 0.001, "loss": 2.1253, "step": 95400 }, { "epoch": 5.585775282213254, "grad_norm": 0.21908316016197205, "learning_rate": 0.001, "loss": 2.1272, "step": 95500 }, { "epoch": 5.5916242615663565, "grad_norm": 0.2629600763320923, "learning_rate": 0.001, "loss": 2.1312, "step": 95600 }, { "epoch": 5.597473240919459, "grad_norm": 0.27116721868515015, "learning_rate": 0.001, "loss": 2.1277, "step": 95700 }, { "epoch": 5.603322220272562, "grad_norm": 0.22592175006866455, "learning_rate": 0.001, "loss": 2.1221, "step": 95800 }, { "epoch": 5.609171199625665, "grad_norm": 0.23095929622650146, "learning_rate": 0.001, "loss": 2.124, "step": 95900 }, { "epoch": 5.615020178978768, "grad_norm": 0.2633673846721649, "learning_rate": 0.001, "loss": 2.1259, "step": 96000 }, { "epoch": 5.620869158331871, "grad_norm": 0.19212281703948975, "learning_rate": 0.001, "loss": 2.127, "step": 96100 }, { "epoch": 5.626718137684974, "grad_norm": 0.190889373421669, "learning_rate": 0.001, "loss": 2.1222, "step": 96200 }, { "epoch": 5.632567117038077, "grad_norm": 0.2655491828918457, "learning_rate": 0.001, "loss": 2.1186, "step": 96300 }, { "epoch": 5.63841609639118, "grad_norm": 0.2967199981212616, "learning_rate": 0.001, "loss": 2.1237, "step": 96400 }, { "epoch": 5.644265075744283, "grad_norm": 0.23788392543792725, "learning_rate": 0.001, "loss": 2.1292, "step": 96500 }, { "epoch": 5.650114055097386, "grad_norm": 0.20494572818279266, "learning_rate": 0.001, "loss": 2.1289, "step": 96600 }, { "epoch": 5.6559630344504885, "grad_norm": 0.22712619602680206, "learning_rate": 0.001, "loss": 2.1198, "step": 96700 }, { "epoch": 5.6618120138035914, "grad_norm": 0.20922763645648956, "learning_rate": 0.001, "loss": 2.1189, "step": 96800 }, { "epoch": 5.667660993156694, "grad_norm": 0.28417500853538513, "learning_rate": 0.001, "loss": 2.1214, "step": 96900 }, { "epoch": 5.673509972509797, "grad_norm": 0.26461485028266907, "learning_rate": 0.001, "loss": 2.124, "step": 97000 }, { "epoch": 5.6793589518629, "grad_norm": 0.2717529833316803, "learning_rate": 0.001, "loss": 2.1256, "step": 97100 }, { "epoch": 5.685207931216003, "grad_norm": 0.2622258961200714, "learning_rate": 0.001, "loss": 2.1284, "step": 97200 }, { "epoch": 5.691056910569106, "grad_norm": 0.2981286942958832, "learning_rate": 0.001, "loss": 2.1242, "step": 97300 }, { "epoch": 5.696905889922209, "grad_norm": 0.2482168972492218, "learning_rate": 0.001, "loss": 2.1263, "step": 97400 }, { "epoch": 5.702754869275312, "grad_norm": 0.23561882972717285, "learning_rate": 0.001, "loss": 2.1227, "step": 97500 }, { "epoch": 5.708603848628414, "grad_norm": 0.2643316686153412, "learning_rate": 0.001, "loss": 2.1244, "step": 97600 }, { "epoch": 5.714452827981518, "grad_norm": 0.2623312175273895, "learning_rate": 0.001, "loss": 2.1188, "step": 97700 }, { "epoch": 5.72030180733462, "grad_norm": 0.20033173263072968, "learning_rate": 0.001, "loss": 2.1271, "step": 97800 }, { "epoch": 5.7261507866877235, "grad_norm": 0.2647962272167206, "learning_rate": 0.001, "loss": 2.1217, "step": 97900 }, { "epoch": 5.7319997660408255, "grad_norm": 0.2878161668777466, "learning_rate": 0.001, "loss": 2.1172, "step": 98000 }, { "epoch": 5.737848745393928, "grad_norm": 0.29704809188842773, "learning_rate": 0.001, "loss": 2.1216, "step": 98100 }, { "epoch": 5.743697724747031, "grad_norm": 0.20811361074447632, "learning_rate": 0.001, "loss": 2.1215, "step": 98200 }, { "epoch": 5.749546704100134, "grad_norm": 0.18814870715141296, "learning_rate": 0.001, "loss": 2.1184, "step": 98300 }, { "epoch": 5.755395683453237, "grad_norm": 0.2864201068878174, "learning_rate": 0.001, "loss": 2.122, "step": 98400 }, { "epoch": 5.76124466280634, "grad_norm": 0.29148194193840027, "learning_rate": 0.001, "loss": 2.1201, "step": 98500 }, { "epoch": 5.767093642159443, "grad_norm": 0.2907048165798187, "learning_rate": 0.001, "loss": 2.1228, "step": 98600 }, { "epoch": 5.772942621512546, "grad_norm": 0.21391929686069489, "learning_rate": 0.001, "loss": 2.1242, "step": 98700 }, { "epoch": 5.778791600865649, "grad_norm": 0.2575721740722656, "learning_rate": 0.001, "loss": 2.119, "step": 98800 }, { "epoch": 5.784640580218752, "grad_norm": 0.258595734834671, "learning_rate": 0.001, "loss": 2.1246, "step": 98900 }, { "epoch": 5.790489559571855, "grad_norm": 0.26843732595443726, "learning_rate": 0.001, "loss": 2.1214, "step": 99000 }, { "epoch": 5.7963385389249575, "grad_norm": 0.21987313032150269, "learning_rate": 0.001, "loss": 2.122, "step": 99100 }, { "epoch": 5.8021875182780605, "grad_norm": 0.20566058158874512, "learning_rate": 0.001, "loss": 2.1132, "step": 99200 }, { "epoch": 5.808036497631163, "grad_norm": 0.23245152831077576, "learning_rate": 0.001, "loss": 2.1145, "step": 99300 }, { "epoch": 5.813885476984266, "grad_norm": 0.14448973536491394, "learning_rate": 0.001, "loss": 2.1247, "step": 99400 }, { "epoch": 5.819734456337369, "grad_norm": 0.1648184061050415, "learning_rate": 0.001, "loss": 2.1226, "step": 99500 }, { "epoch": 5.825583435690472, "grad_norm": 0.27770861983299255, "learning_rate": 0.001, "loss": 2.1228, "step": 99600 }, { "epoch": 5.831432415043575, "grad_norm": 0.2791621685028076, "learning_rate": 0.001, "loss": 2.1244, "step": 99700 }, { "epoch": 5.837281394396678, "grad_norm": 0.2150287926197052, "learning_rate": 0.001, "loss": 2.1278, "step": 99800 }, { "epoch": 5.843130373749781, "grad_norm": 0.21594169735908508, "learning_rate": 0.001, "loss": 2.1171, "step": 99900 }, { "epoch": 5.848979353102884, "grad_norm": 0.20374539494514465, "learning_rate": 0.001, "loss": 2.1265, "step": 100000 }, { "epoch": 5.848979353102884, "eval_ag_news_accuracy": 0.2285625, "eval_ag_news_bleu_score": 5.956263575457368, "eval_ag_news_bleu_score_sem": 0.42907333928913105, "eval_ag_news_emb_cos_sim": 0.6494214534759521, "eval_ag_news_emb_cos_sim_sem": 0.015933819115161896, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.880404233932495, "eval_ag_news_n_ngrams_match_1": 12.6328125, "eval_ag_news_n_ngrams_match_2": 3.3203125, "eval_ag_news_n_ngrams_match_3": 1.265625, "eval_ag_news_num_pred_words": 48.625, "eval_ag_news_num_true_words": 45.328125, "eval_ag_news_perplexity": 17.82147576898157, "eval_ag_news_pred_num_tokens": 74.46875, "eval_ag_news_rouge_score": 0.251844132649264, "eval_ag_news_runtime": 38.5247, "eval_ag_news_samples_per_second": 12.979, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.29470902482326, "eval_ag_news_token_set_f1_sem": 0.00903580954619659, "eval_ag_news_token_set_precision": 0.2774672987587278, "eval_ag_news_token_set_recall": 0.32572948249829137, "eval_ag_news_true_num_tokens": 63.8515625, "step": 100000 }, { "epoch": 5.848979353102884, "eval_anthropic_toxic_prompts_accuracy": 0.098453125, "eval_anthropic_toxic_prompts_bleu_score": 32.59288647763104, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.3786169608906422, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8389565348625183, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.013341598212718964, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.0859375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02487009494926453, "eval_anthropic_toxic_prompts_loss": 1.3925126791000366, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.4765625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 4.7734375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 2.84375, "eval_anthropic_toxic_prompts_num_pred_words": 16.78125, "eval_anthropic_toxic_prompts_num_true_words": 14.5859375, "eval_anthropic_toxic_prompts_perplexity": 4.024950767424879, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.703125, "eval_anthropic_toxic_prompts_rouge_score": 0.6033243212859363, "eval_anthropic_toxic_prompts_runtime": 31.5405, "eval_anthropic_toxic_prompts_samples_per_second": 15.853, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.6296213228005558, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01868574895084192, "eval_anthropic_toxic_prompts_token_set_precision": 0.6316372317852839, "eval_anthropic_toxic_prompts_token_set_recall": 0.636493408722366, "eval_anthropic_toxic_prompts_true_num_tokens": 17.8203125, "step": 100000 }, { "epoch": 5.848979353102884, "eval_arxiv_accuracy": 0.360875, "eval_arxiv_bleu_score": 1.1239591496630477, "eval_arxiv_bleu_score_sem": 0.09092879766687495, "eval_arxiv_emb_cos_sim": 0.35496360063552856, "eval_arxiv_emb_cos_sim_sem": 0.016506526619195938, "eval_arxiv_emb_top1_equal": 0.8046875, "eval_arxiv_emb_top1_equal_sem": 0.03517845645546913, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.5403079986572266, "eval_arxiv_n_ngrams_match_1": 10.5390625, "eval_arxiv_n_ngrams_match_2": 1.6171875, "eval_arxiv_n_ngrams_match_3": 0.1796875, "eval_arxiv_num_pred_words": 48.1015625, "eval_arxiv_num_true_words": 85.734375, "eval_arxiv_perplexity": 34.47753659067542, "eval_arxiv_pred_num_tokens": 126.6640625, "eval_arxiv_rouge_score": 0.14361074987790107, "eval_arxiv_runtime": 32.3336, "eval_arxiv_samples_per_second": 15.464, "eval_arxiv_steps_per_second": 0.031, "eval_arxiv_token_set_f1": 0.1354912573171644, "eval_arxiv_token_set_f1_sem": 0.007801937681062606, "eval_arxiv_token_set_precision": 0.08800422132310755, "eval_arxiv_token_set_recall": 0.3753875953022701, "eval_arxiv_true_num_tokens": 125.203125, "step": 100000 }, { "epoch": 5.848979353102884, "eval_python_code_alpaca_accuracy": 0.12090625, "eval_python_code_alpaca_bleu_score": 22.79481862892887, "eval_python_code_alpaca_bleu_score_sem": 1.425677524422041, "eval_python_code_alpaca_emb_cos_sim": 0.8308481574058533, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011956846341490746, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.6361662149429321, "eval_python_code_alpaca_n_ngrams_match_1": 9.921875, "eval_python_code_alpaca_n_ngrams_match_2": 4.921875, "eval_python_code_alpaca_n_ngrams_match_3": 2.609375, "eval_python_code_alpaca_num_pred_words": 18.015625, "eval_python_code_alpaca_num_true_words": 18.9296875, "eval_python_code_alpaca_perplexity": 5.1354435371272, "eval_python_code_alpaca_pred_num_tokens": 24.515625, "eval_python_code_alpaca_rouge_score": 0.5476341120817995, "eval_python_code_alpaca_runtime": 30.2629, "eval_python_code_alpaca_samples_per_second": 16.522, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.5711811480963738, "eval_python_code_alpaca_token_set_f1_sem": 0.014233072242307855, "eval_python_code_alpaca_token_set_precision": 0.5542038017427533, "eval_python_code_alpaca_token_set_recall": 0.596649489099905, "eval_python_code_alpaca_true_num_tokens": 25.125, "step": 100000 }, { "epoch": 5.848979353102884, "eval_wikibio_accuracy": 0.35853125, "eval_wikibio_bleu_score": 6.361242183815891, "eval_wikibio_bleu_score_sem": 0.7021315573881856, "eval_wikibio_emb_cos_sim": 0.5128795504570007, "eval_wikibio_emb_cos_sim_sem": 0.024146465584635735, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.856553077697754, "eval_wikibio_n_ngrams_match_1": 13.0703125, "eval_wikibio_n_ngrams_match_2": 4.109375, "eval_wikibio_n_ngrams_match_3": 1.71875, "eval_wikibio_num_pred_words": 52.6640625, "eval_wikibio_num_true_words": 51.140625, "eval_wikibio_perplexity": 17.40144202345499, "eval_wikibio_pred_num_tokens": 106.71875, "eval_wikibio_rouge_score": 0.2448142761897899, "eval_wikibio_runtime": 30.4992, "eval_wikibio_samples_per_second": 16.394, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.27734481216224754, "eval_wikibio_token_set_f1_sem": 0.01505811631117952, "eval_wikibio_token_set_precision": 0.24511003847967097, "eval_wikibio_token_set_recall": 0.3524441940394523, "eval_wikibio_true_num_tokens": 99.09375, "step": 100000 }, { "epoch": 5.848979353102884, "eval_msmarco_accuracy": 0.364, "eval_msmarco_bleu_score": 12.991299839808892, "eval_msmarco_bleu_score_sem": 1.1204123182180068, "eval_msmarco_emb_cos_sim": 0.7127721309661865, "eval_msmarco_emb_cos_sim_sem": 0.018786394968628883, "eval_msmarco_emb_top1_equal": 0.90625, "eval_msmarco_emb_top1_equal_sem": 0.025864720344543457, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.9959454536437988, "eval_msmarco_n_ngrams_match_1": 24.9765625, "eval_msmarco_n_ngrams_match_2": 10.328125, "eval_msmarco_n_ngrams_match_3": 5.46875, "eval_msmarco_num_pred_words": 62.4765625, "eval_msmarco_num_true_words": 62.0546875, "eval_msmarco_perplexity": 7.359157482072448, "eval_msmarco_pred_num_tokens": 88.6796875, "eval_msmarco_rouge_score": 0.3862391072170958, "eval_msmarco_runtime": 25.9314, "eval_msmarco_samples_per_second": 19.282, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4216933949025148, "eval_msmarco_token_set_f1_sem": 0.013295957290270624, "eval_msmarco_token_set_precision": 0.37878183544415756, "eval_msmarco_token_set_recall": 0.5049543333176495, "eval_msmarco_true_num_tokens": 80.203125, "step": 100000 }, { "epoch": 5.854828332455987, "grad_norm": 0.2574097812175751, "learning_rate": 0.001, "loss": 2.1202, "step": 100100 }, { "epoch": 5.86067731180909, "grad_norm": 0.2339394986629486, "learning_rate": 0.001, "loss": 2.1179, "step": 100200 }, { "epoch": 5.8665262911621925, "grad_norm": 0.2739557921886444, "learning_rate": 0.001, "loss": 2.1186, "step": 100300 }, { "epoch": 5.872375270515295, "grad_norm": 0.25309568643569946, "learning_rate": 0.001, "loss": 2.1147, "step": 100400 }, { "epoch": 5.878224249868398, "grad_norm": 0.2515398859977722, "learning_rate": 0.001, "loss": 2.1182, "step": 100500 }, { "epoch": 5.884073229221501, "grad_norm": 0.2506633996963501, "learning_rate": 0.001, "loss": 2.1165, "step": 100600 }, { "epoch": 5.889922208574603, "grad_norm": 0.26542747020721436, "learning_rate": 0.001, "loss": 2.1171, "step": 100700 }, { "epoch": 5.895771187927707, "grad_norm": 0.27050501108169556, "learning_rate": 0.001, "loss": 2.1131, "step": 100800 }, { "epoch": 5.901620167280809, "grad_norm": 0.2435375303030014, "learning_rate": 0.001, "loss": 2.1195, "step": 100900 }, { "epoch": 5.907469146633913, "grad_norm": 0.34703242778778076, "learning_rate": 0.001, "loss": 2.1242, "step": 101000 }, { "epoch": 5.913318125987015, "grad_norm": 0.26332414150238037, "learning_rate": 0.001, "loss": 2.1215, "step": 101100 }, { "epoch": 5.919167105340118, "grad_norm": 0.25666889548301697, "learning_rate": 0.001, "loss": 2.1109, "step": 101200 }, { "epoch": 5.925016084693221, "grad_norm": 0.31282225251197815, "learning_rate": 0.001, "loss": 2.1191, "step": 101300 }, { "epoch": 5.930865064046324, "grad_norm": 0.3243866562843323, "learning_rate": 0.001, "loss": 2.1207, "step": 101400 }, { "epoch": 5.9367140433994265, "grad_norm": 0.24442337453365326, "learning_rate": 0.001, "loss": 2.1146, "step": 101500 }, { "epoch": 5.9425630227525295, "grad_norm": 0.2608204185962677, "learning_rate": 0.001, "loss": 2.1183, "step": 101600 }, { "epoch": 5.948412002105632, "grad_norm": 0.2879640758037567, "learning_rate": 0.001, "loss": 2.1093, "step": 101700 }, { "epoch": 5.954260981458735, "grad_norm": 0.27608925104141235, "learning_rate": 0.001, "loss": 2.1143, "step": 101800 }, { "epoch": 5.960109960811838, "grad_norm": 0.2489030808210373, "learning_rate": 0.001, "loss": 2.1223, "step": 101900 }, { "epoch": 5.965958940164941, "grad_norm": 0.2895537316799164, "learning_rate": 0.001, "loss": 2.1117, "step": 102000 }, { "epoch": 5.971807919518044, "grad_norm": 0.20669177174568176, "learning_rate": 0.001, "loss": 2.1141, "step": 102100 }, { "epoch": 5.977656898871147, "grad_norm": 0.2552451193332672, "learning_rate": 0.001, "loss": 2.1158, "step": 102200 }, { "epoch": 5.98350587822425, "grad_norm": 0.2305663377046585, "learning_rate": 0.001, "loss": 2.1157, "step": 102300 }, { "epoch": 5.989354857577353, "grad_norm": 0.2048824429512024, "learning_rate": 0.001, "loss": 2.1132, "step": 102400 }, { "epoch": 5.995203836930456, "grad_norm": 0.26770251989364624, "learning_rate": 0.001, "loss": 2.1109, "step": 102500 }, { "epoch": 6.001052816283559, "grad_norm": 0.23777908086776733, "learning_rate": 0.001, "loss": 2.1194, "step": 102600 }, { "epoch": 6.0069017956366615, "grad_norm": 0.21314191818237305, "learning_rate": 0.001, "loss": 2.1021, "step": 102700 }, { "epoch": 6.012750774989764, "grad_norm": 0.30374109745025635, "learning_rate": 0.001, "loss": 2.102, "step": 102800 }, { "epoch": 6.018599754342867, "grad_norm": 0.23842616379261017, "learning_rate": 0.001, "loss": 2.1024, "step": 102900 }, { "epoch": 6.02444873369597, "grad_norm": 0.24678194522857666, "learning_rate": 0.001, "loss": 2.0987, "step": 103000 }, { "epoch": 6.030297713049073, "grad_norm": 0.29376375675201416, "learning_rate": 0.001, "loss": 2.1015, "step": 103100 }, { "epoch": 6.036146692402176, "grad_norm": 0.22300441563129425, "learning_rate": 0.001, "loss": 2.0989, "step": 103200 }, { "epoch": 6.041995671755279, "grad_norm": 0.2978113889694214, "learning_rate": 0.001, "loss": 2.0969, "step": 103300 }, { "epoch": 6.047844651108382, "grad_norm": 0.27880674600601196, "learning_rate": 0.001, "loss": 2.0972, "step": 103400 }, { "epoch": 6.053693630461485, "grad_norm": 0.2210063338279724, "learning_rate": 0.001, "loss": 2.1075, "step": 103500 }, { "epoch": 6.059542609814588, "grad_norm": 0.3057677447795868, "learning_rate": 0.001, "loss": 2.0983, "step": 103600 }, { "epoch": 6.065391589167691, "grad_norm": 0.2811333239078522, "learning_rate": 0.001, "loss": 2.1054, "step": 103700 }, { "epoch": 6.0712405685207935, "grad_norm": 0.2313440889120102, "learning_rate": 0.001, "loss": 2.1004, "step": 103800 }, { "epoch": 6.077089547873896, "grad_norm": 0.23024940490722656, "learning_rate": 0.001, "loss": 2.0965, "step": 103900 }, { "epoch": 6.0829385272269985, "grad_norm": 0.2158385068178177, "learning_rate": 0.001, "loss": 2.0969, "step": 104000 }, { "epoch": 6.088787506580101, "grad_norm": 0.2841322720050812, "learning_rate": 0.001, "loss": 2.1011, "step": 104100 }, { "epoch": 6.094636485933204, "grad_norm": 0.262212872505188, "learning_rate": 0.001, "loss": 2.1041, "step": 104200 }, { "epoch": 6.100485465286307, "grad_norm": 0.28073957562446594, "learning_rate": 0.001, "loss": 2.1001, "step": 104300 }, { "epoch": 6.10633444463941, "grad_norm": 0.26083070039749146, "learning_rate": 0.001, "loss": 2.1017, "step": 104400 }, { "epoch": 6.112183423992513, "grad_norm": 0.23963218927383423, "learning_rate": 0.001, "loss": 2.1022, "step": 104500 }, { "epoch": 6.118032403345616, "grad_norm": 0.24107223749160767, "learning_rate": 0.001, "loss": 2.097, "step": 104600 }, { "epoch": 6.123881382698719, "grad_norm": 0.24096304178237915, "learning_rate": 0.001, "loss": 2.1019, "step": 104700 }, { "epoch": 6.129730362051822, "grad_norm": 0.37832820415496826, "learning_rate": 0.001, "loss": 2.1034, "step": 104800 }, { "epoch": 6.135579341404925, "grad_norm": 0.2585601508617401, "learning_rate": 0.001, "loss": 2.0999, "step": 104900 }, { "epoch": 6.141428320758028, "grad_norm": 0.28771835565567017, "learning_rate": 0.001, "loss": 2.0971, "step": 105000 }, { "epoch": 6.1472773001111305, "grad_norm": 0.3166847229003906, "learning_rate": 0.001, "loss": 2.0992, "step": 105100 }, { "epoch": 6.153126279464233, "grad_norm": 0.24630868434906006, "learning_rate": 0.001, "loss": 2.1008, "step": 105200 }, { "epoch": 6.158975258817336, "grad_norm": 0.2847469449043274, "learning_rate": 0.001, "loss": 2.1032, "step": 105300 }, { "epoch": 6.164824238170439, "grad_norm": 0.229908287525177, "learning_rate": 0.001, "loss": 2.1037, "step": 105400 }, { "epoch": 6.170673217523542, "grad_norm": 0.2510509192943573, "learning_rate": 0.001, "loss": 2.0964, "step": 105500 }, { "epoch": 6.176522196876645, "grad_norm": 0.22734995186328888, "learning_rate": 0.001, "loss": 2.098, "step": 105600 }, { "epoch": 6.182371176229748, "grad_norm": 0.28731751441955566, "learning_rate": 0.001, "loss": 2.1007, "step": 105700 }, { "epoch": 6.188220155582851, "grad_norm": 0.3047807514667511, "learning_rate": 0.001, "loss": 2.0998, "step": 105800 }, { "epoch": 6.194069134935954, "grad_norm": 0.3337942361831665, "learning_rate": 0.001, "loss": 2.0976, "step": 105900 }, { "epoch": 6.199918114289057, "grad_norm": 0.3040502071380615, "learning_rate": 0.001, "loss": 2.0976, "step": 106000 }, { "epoch": 6.20576709364216, "grad_norm": 0.2632260322570801, "learning_rate": 0.001, "loss": 2.1004, "step": 106100 }, { "epoch": 6.2116160729952625, "grad_norm": 0.21615566313266754, "learning_rate": 0.001, "loss": 2.0983, "step": 106200 }, { "epoch": 6.217465052348365, "grad_norm": 0.23393277823925018, "learning_rate": 0.001, "loss": 2.0955, "step": 106300 }, { "epoch": 6.223314031701468, "grad_norm": 0.31773945689201355, "learning_rate": 0.001, "loss": 2.0994, "step": 106400 }, { "epoch": 6.229163011054571, "grad_norm": 0.27450287342071533, "learning_rate": 0.001, "loss": 2.1064, "step": 106500 }, { "epoch": 6.235011990407674, "grad_norm": 0.3135039210319519, "learning_rate": 0.001, "loss": 2.1053, "step": 106600 }, { "epoch": 6.240860969760777, "grad_norm": 0.2548730671405792, "learning_rate": 0.001, "loss": 2.1034, "step": 106700 }, { "epoch": 6.24670994911388, "grad_norm": 0.21735608577728271, "learning_rate": 0.001, "loss": 2.1001, "step": 106800 }, { "epoch": 6.252558928466983, "grad_norm": 0.2528938353061676, "learning_rate": 0.001, "loss": 2.099, "step": 106900 }, { "epoch": 6.258407907820086, "grad_norm": 0.27050113677978516, "learning_rate": 0.001, "loss": 2.1012, "step": 107000 }, { "epoch": 6.264256887173188, "grad_norm": 0.26338350772857666, "learning_rate": 0.001, "loss": 2.1097, "step": 107100 }, { "epoch": 6.270105866526291, "grad_norm": 0.289813756942749, "learning_rate": 0.001, "loss": 2.1015, "step": 107200 }, { "epoch": 6.275954845879394, "grad_norm": 0.34191712737083435, "learning_rate": 0.001, "loss": 2.1026, "step": 107300 }, { "epoch": 6.281803825232497, "grad_norm": 0.27276644110679626, "learning_rate": 0.001, "loss": 2.0987, "step": 107400 }, { "epoch": 6.2876528045855995, "grad_norm": 0.23259571194648743, "learning_rate": 0.001, "loss": 2.1034, "step": 107500 }, { "epoch": 6.293501783938702, "grad_norm": 0.2447083592414856, "learning_rate": 0.001, "loss": 2.0948, "step": 107600 }, { "epoch": 6.299350763291805, "grad_norm": 0.2040775567293167, "learning_rate": 0.001, "loss": 2.0993, "step": 107700 }, { "epoch": 6.305199742644908, "grad_norm": 0.2781218886375427, "learning_rate": 0.001, "loss": 2.1, "step": 107800 }, { "epoch": 6.311048721998011, "grad_norm": 0.22619770467281342, "learning_rate": 0.001, "loss": 2.0995, "step": 107900 }, { "epoch": 6.316897701351114, "grad_norm": 0.2240651249885559, "learning_rate": 0.001, "loss": 2.0976, "step": 108000 }, { "epoch": 6.322746680704217, "grad_norm": 0.19287905097007751, "learning_rate": 0.001, "loss": 2.0996, "step": 108100 }, { "epoch": 6.32859566005732, "grad_norm": 0.23125314712524414, "learning_rate": 0.001, "loss": 2.1058, "step": 108200 }, { "epoch": 6.334444639410423, "grad_norm": 0.2538068890571594, "learning_rate": 0.001, "loss": 2.1009, "step": 108300 }, { "epoch": 6.340293618763526, "grad_norm": 0.2353450208902359, "learning_rate": 0.001, "loss": 2.0921, "step": 108400 }, { "epoch": 6.346142598116629, "grad_norm": 0.17340697348117828, "learning_rate": 0.001, "loss": 2.1015, "step": 108500 }, { "epoch": 6.3519915774697315, "grad_norm": 0.32993075251579285, "learning_rate": 0.001, "loss": 2.0987, "step": 108600 }, { "epoch": 6.357840556822834, "grad_norm": 0.31950458884239197, "learning_rate": 0.001, "loss": 2.1058, "step": 108700 }, { "epoch": 6.363689536175937, "grad_norm": 0.22938083112239838, "learning_rate": 0.001, "loss": 2.0982, "step": 108800 }, { "epoch": 6.36953851552904, "grad_norm": 0.29100000858306885, "learning_rate": 0.001, "loss": 2.0959, "step": 108900 }, { "epoch": 6.375387494882143, "grad_norm": 0.30973610281944275, "learning_rate": 0.001, "loss": 2.1009, "step": 109000 }, { "epoch": 6.381236474235246, "grad_norm": 0.3198472261428833, "learning_rate": 0.001, "loss": 2.1014, "step": 109100 }, { "epoch": 6.387085453588349, "grad_norm": 0.2357063591480255, "learning_rate": 0.001, "loss": 2.098, "step": 109200 }, { "epoch": 6.392934432941452, "grad_norm": 0.2664979100227356, "learning_rate": 0.001, "loss": 2.094, "step": 109300 }, { "epoch": 6.398783412294555, "grad_norm": 0.28255847096443176, "learning_rate": 0.001, "loss": 2.093, "step": 109400 }, { "epoch": 6.404632391647658, "grad_norm": 0.2241983711719513, "learning_rate": 0.001, "loss": 2.1005, "step": 109500 }, { "epoch": 6.410481371000761, "grad_norm": 0.29139211773872375, "learning_rate": 0.001, "loss": 2.1051, "step": 109600 }, { "epoch": 6.416330350353864, "grad_norm": 0.2747756242752075, "learning_rate": 0.001, "loss": 2.1034, "step": 109700 }, { "epoch": 6.4221793297069665, "grad_norm": 0.16917309165000916, "learning_rate": 0.001, "loss": 2.0958, "step": 109800 }, { "epoch": 6.428028309060069, "grad_norm": 0.3368980288505554, "learning_rate": 0.001, "loss": 2.0961, "step": 109900 }, { "epoch": 6.433877288413172, "grad_norm": 0.24445271492004395, "learning_rate": 0.001, "loss": 2.0952, "step": 110000 }, { "epoch": 6.439726267766275, "grad_norm": 0.24382413923740387, "learning_rate": 0.001, "loss": 2.1, "step": 110100 }, { "epoch": 6.445575247119377, "grad_norm": 0.24839873611927032, "learning_rate": 0.001, "loss": 2.0984, "step": 110200 }, { "epoch": 6.45142422647248, "grad_norm": 0.3203531503677368, "learning_rate": 0.001, "loss": 2.0934, "step": 110300 }, { "epoch": 6.457273205825583, "grad_norm": 0.26838013529777527, "learning_rate": 0.001, "loss": 2.0949, "step": 110400 }, { "epoch": 6.463122185178686, "grad_norm": 0.22646059095859528, "learning_rate": 0.001, "loss": 2.0982, "step": 110500 }, { "epoch": 6.468971164531789, "grad_norm": 0.2514132857322693, "learning_rate": 0.001, "loss": 2.0962, "step": 110600 }, { "epoch": 6.474820143884892, "grad_norm": 0.3012280762195587, "learning_rate": 0.001, "loss": 2.0974, "step": 110700 }, { "epoch": 6.480669123237995, "grad_norm": 0.23975202441215515, "learning_rate": 0.001, "loss": 2.095, "step": 110800 }, { "epoch": 6.486518102591098, "grad_norm": 0.24384506046772003, "learning_rate": 0.001, "loss": 2.0989, "step": 110900 }, { "epoch": 6.4923670819442005, "grad_norm": 0.31055548787117004, "learning_rate": 0.001, "loss": 2.097, "step": 111000 }, { "epoch": 6.4982160612973034, "grad_norm": 0.20272226631641388, "learning_rate": 0.001, "loss": 2.0925, "step": 111100 }, { "epoch": 6.504065040650406, "grad_norm": 0.25046879053115845, "learning_rate": 0.001, "loss": 2.098, "step": 111200 }, { "epoch": 6.509914020003509, "grad_norm": 0.24223729968070984, "learning_rate": 0.001, "loss": 2.1017, "step": 111300 }, { "epoch": 6.515762999356612, "grad_norm": 0.223446324467659, "learning_rate": 0.001, "loss": 2.0967, "step": 111400 }, { "epoch": 6.521611978709715, "grad_norm": 0.30099615454673767, "learning_rate": 0.001, "loss": 2.0955, "step": 111500 }, { "epoch": 6.527460958062818, "grad_norm": 0.2880154252052307, "learning_rate": 0.001, "loss": 2.0999, "step": 111600 }, { "epoch": 6.533309937415921, "grad_norm": 0.22631791234016418, "learning_rate": 0.001, "loss": 2.0987, "step": 111700 }, { "epoch": 6.539158916769024, "grad_norm": 0.220871239900589, "learning_rate": 0.001, "loss": 2.0915, "step": 111800 }, { "epoch": 6.545007896122127, "grad_norm": 0.2844788730144501, "learning_rate": 0.001, "loss": 2.0947, "step": 111900 }, { "epoch": 6.55085687547523, "grad_norm": 0.3112492561340332, "learning_rate": 0.001, "loss": 2.0943, "step": 112000 }, { "epoch": 6.556705854828333, "grad_norm": 0.26832643151283264, "learning_rate": 0.001, "loss": 2.0894, "step": 112100 }, { "epoch": 6.5625548341814355, "grad_norm": 0.2995191514492035, "learning_rate": 0.001, "loss": 2.0977, "step": 112200 }, { "epoch": 6.568403813534538, "grad_norm": 0.3004911541938782, "learning_rate": 0.001, "loss": 2.0994, "step": 112300 }, { "epoch": 6.574252792887641, "grad_norm": 0.26033711433410645, "learning_rate": 0.001, "loss": 2.0952, "step": 112400 }, { "epoch": 6.580101772240744, "grad_norm": 0.2631012201309204, "learning_rate": 0.001, "loss": 2.0896, "step": 112500 }, { "epoch": 6.585950751593847, "grad_norm": 0.24537527561187744, "learning_rate": 0.001, "loss": 2.1032, "step": 112600 }, { "epoch": 6.59179973094695, "grad_norm": 0.2032749503850937, "learning_rate": 0.001, "loss": 2.0953, "step": 112700 }, { "epoch": 6.597648710300053, "grad_norm": 0.22756575047969818, "learning_rate": 0.001, "loss": 2.0944, "step": 112800 }, { "epoch": 6.603497689653156, "grad_norm": 0.21814322471618652, "learning_rate": 0.001, "loss": 2.1002, "step": 112900 }, { "epoch": 6.609346669006259, "grad_norm": 0.2669038474559784, "learning_rate": 0.001, "loss": 2.0918, "step": 113000 }, { "epoch": 6.615195648359361, "grad_norm": 0.2520856261253357, "learning_rate": 0.001, "loss": 2.0894, "step": 113100 }, { "epoch": 6.621044627712465, "grad_norm": 0.20476137101650238, "learning_rate": 0.001, "loss": 2.0952, "step": 113200 }, { "epoch": 6.626893607065567, "grad_norm": 0.2697194814682007, "learning_rate": 0.001, "loss": 2.0994, "step": 113300 }, { "epoch": 6.63274258641867, "grad_norm": 0.24247850477695465, "learning_rate": 0.001, "loss": 2.0908, "step": 113400 }, { "epoch": 6.6385915657717725, "grad_norm": 0.2710765302181244, "learning_rate": 0.001, "loss": 2.0918, "step": 113500 }, { "epoch": 6.644440545124875, "grad_norm": 0.23569287359714508, "learning_rate": 0.001, "loss": 2.0976, "step": 113600 }, { "epoch": 6.650289524477978, "grad_norm": 0.3012280762195587, "learning_rate": 0.001, "loss": 2.0924, "step": 113700 }, { "epoch": 6.656138503831081, "grad_norm": 0.2618626058101654, "learning_rate": 0.001, "loss": 2.0994, "step": 113800 }, { "epoch": 6.661987483184184, "grad_norm": 0.24895215034484863, "learning_rate": 0.001, "loss": 2.1014, "step": 113900 }, { "epoch": 6.667836462537287, "grad_norm": 0.2512902617454529, "learning_rate": 0.001, "loss": 2.0985, "step": 114000 }, { "epoch": 6.67368544189039, "grad_norm": 0.3379908800125122, "learning_rate": 0.001, "loss": 2.0972, "step": 114100 }, { "epoch": 6.679534421243493, "grad_norm": 0.2907579243183136, "learning_rate": 0.001, "loss": 2.0967, "step": 114200 }, { "epoch": 6.685383400596596, "grad_norm": 0.22269020974636078, "learning_rate": 0.001, "loss": 2.0963, "step": 114300 }, { "epoch": 6.691232379949699, "grad_norm": 0.22042356431484222, "learning_rate": 0.001, "loss": 2.0918, "step": 114400 }, { "epoch": 6.697081359302802, "grad_norm": 0.23010016977787018, "learning_rate": 0.001, "loss": 2.0945, "step": 114500 }, { "epoch": 6.7029303386559045, "grad_norm": 0.28595104813575745, "learning_rate": 0.001, "loss": 2.0924, "step": 114600 }, { "epoch": 6.708779318009007, "grad_norm": 0.2763291001319885, "learning_rate": 0.001, "loss": 2.0976, "step": 114700 }, { "epoch": 6.71462829736211, "grad_norm": 0.26177605986595154, "learning_rate": 0.001, "loss": 2.0962, "step": 114800 }, { "epoch": 6.720477276715213, "grad_norm": 0.20230534672737122, "learning_rate": 0.001, "loss": 2.0901, "step": 114900 }, { "epoch": 6.726326256068316, "grad_norm": 0.29557478427886963, "learning_rate": 0.001, "loss": 2.0979, "step": 115000 }, { "epoch": 6.732175235421419, "grad_norm": 0.2838312089443207, "learning_rate": 0.001, "loss": 2.0939, "step": 115100 }, { "epoch": 6.738024214774522, "grad_norm": 0.34011566638946533, "learning_rate": 0.001, "loss": 2.0889, "step": 115200 }, { "epoch": 6.743873194127625, "grad_norm": 0.27445369958877563, "learning_rate": 0.001, "loss": 2.0934, "step": 115300 }, { "epoch": 6.749722173480728, "grad_norm": 0.24660727381706238, "learning_rate": 0.001, "loss": 2.0908, "step": 115400 }, { "epoch": 6.755571152833831, "grad_norm": 0.41375404596328735, "learning_rate": 0.001, "loss": 2.1017, "step": 115500 }, { "epoch": 6.761420132186934, "grad_norm": 0.21654610335826874, "learning_rate": 0.001, "loss": 2.0951, "step": 115600 }, { "epoch": 6.7672691115400365, "grad_norm": 0.2323935478925705, "learning_rate": 0.001, "loss": 2.0951, "step": 115700 }, { "epoch": 6.773118090893139, "grad_norm": 0.3448793888092041, "learning_rate": 0.001, "loss": 2.0988, "step": 115800 }, { "epoch": 6.778967070246242, "grad_norm": 0.28353142738342285, "learning_rate": 0.001, "loss": 2.0948, "step": 115900 }, { "epoch": 6.784816049599345, "grad_norm": 0.27857154607772827, "learning_rate": 0.001, "loss": 2.0949, "step": 116000 }, { "epoch": 6.790665028952448, "grad_norm": 0.2636224329471588, "learning_rate": 0.001, "loss": 2.0927, "step": 116100 }, { "epoch": 6.79651400830555, "grad_norm": 0.17908278107643127, "learning_rate": 0.001, "loss": 2.0819, "step": 116200 }, { "epoch": 6.802362987658654, "grad_norm": 0.27752840518951416, "learning_rate": 0.001, "loss": 2.0954, "step": 116300 }, { "epoch": 6.808211967011756, "grad_norm": 0.21478910744190216, "learning_rate": 0.001, "loss": 2.0924, "step": 116400 }, { "epoch": 6.81406094636486, "grad_norm": 0.29494813084602356, "learning_rate": 0.001, "loss": 2.0927, "step": 116500 }, { "epoch": 6.819909925717962, "grad_norm": 0.21078239381313324, "learning_rate": 0.001, "loss": 2.0919, "step": 116600 }, { "epoch": 6.825758905071065, "grad_norm": 0.23149918019771576, "learning_rate": 0.001, "loss": 2.0937, "step": 116700 }, { "epoch": 6.831607884424168, "grad_norm": 0.2725890278816223, "learning_rate": 0.001, "loss": 2.0902, "step": 116800 }, { "epoch": 6.837456863777271, "grad_norm": 0.1780610829591751, "learning_rate": 0.001, "loss": 2.0933, "step": 116900 }, { "epoch": 6.8433058431303735, "grad_norm": 0.27764075994491577, "learning_rate": 0.001, "loss": 2.0884, "step": 117000 }, { "epoch": 6.849154822483476, "grad_norm": 0.26406729221343994, "learning_rate": 0.001, "loss": 2.0892, "step": 117100 }, { "epoch": 6.855003801836579, "grad_norm": 0.24047105014324188, "learning_rate": 0.001, "loss": 2.0892, "step": 117200 }, { "epoch": 6.860852781189682, "grad_norm": 0.22554342448711395, "learning_rate": 0.001, "loss": 2.0964, "step": 117300 }, { "epoch": 6.866701760542785, "grad_norm": 0.23797762393951416, "learning_rate": 0.001, "loss": 2.0919, "step": 117400 }, { "epoch": 6.872550739895888, "grad_norm": 0.24283836781978607, "learning_rate": 0.001, "loss": 2.0888, "step": 117500 }, { "epoch": 6.878399719248991, "grad_norm": 0.2676140069961548, "learning_rate": 0.001, "loss": 2.0958, "step": 117600 }, { "epoch": 6.884248698602094, "grad_norm": 0.24783118069171906, "learning_rate": 0.001, "loss": 2.0932, "step": 117700 }, { "epoch": 6.890097677955197, "grad_norm": 0.40542346239089966, "learning_rate": 0.001, "loss": 2.0954, "step": 117800 }, { "epoch": 6.8959466573083, "grad_norm": 0.24625609815120697, "learning_rate": 0.001, "loss": 2.0976, "step": 117900 }, { "epoch": 6.901795636661403, "grad_norm": 0.28968632221221924, "learning_rate": 0.001, "loss": 2.0907, "step": 118000 }, { "epoch": 6.9076446160145055, "grad_norm": 0.26504454016685486, "learning_rate": 0.001, "loss": 2.0899, "step": 118100 }, { "epoch": 6.913493595367608, "grad_norm": 0.36727505922317505, "learning_rate": 0.001, "loss": 2.0904, "step": 118200 }, { "epoch": 6.919342574720711, "grad_norm": 0.30778583884239197, "learning_rate": 0.001, "loss": 2.088, "step": 118300 }, { "epoch": 6.925191554073814, "grad_norm": 0.33569571375846863, "learning_rate": 0.001, "loss": 2.0938, "step": 118400 }, { "epoch": 6.931040533426917, "grad_norm": 0.28838837146759033, "learning_rate": 0.001, "loss": 2.0955, "step": 118500 }, { "epoch": 6.93688951278002, "grad_norm": 0.2521663308143616, "learning_rate": 0.001, "loss": 2.092, "step": 118600 }, { "epoch": 6.942738492133123, "grad_norm": 0.21847376227378845, "learning_rate": 0.001, "loss": 2.0923, "step": 118700 }, { "epoch": 6.948587471486226, "grad_norm": 0.24902141094207764, "learning_rate": 0.001, "loss": 2.0934, "step": 118800 }, { "epoch": 6.954436450839329, "grad_norm": 0.26442793011665344, "learning_rate": 0.001, "loss": 2.0921, "step": 118900 }, { "epoch": 6.960285430192432, "grad_norm": 0.24910275638103485, "learning_rate": 0.001, "loss": 2.0934, "step": 119000 }, { "epoch": 6.966134409545535, "grad_norm": 0.21298056840896606, "learning_rate": 0.001, "loss": 2.088, "step": 119100 }, { "epoch": 6.971983388898638, "grad_norm": 0.2573557198047638, "learning_rate": 0.001, "loss": 2.0905, "step": 119200 }, { "epoch": 6.97783236825174, "grad_norm": 0.2621632218360901, "learning_rate": 0.001, "loss": 2.0888, "step": 119300 }, { "epoch": 6.983681347604843, "grad_norm": 0.28074780106544495, "learning_rate": 0.001, "loss": 2.0924, "step": 119400 }, { "epoch": 6.989530326957945, "grad_norm": 0.27343764901161194, "learning_rate": 0.001, "loss": 2.0952, "step": 119500 }, { "epoch": 6.995379306311049, "grad_norm": 0.22017066180706024, "learning_rate": 0.001, "loss": 2.0894, "step": 119600 }, { "epoch": 7.001228285664151, "grad_norm": 0.20198820531368256, "learning_rate": 0.001, "loss": 2.0866, "step": 119700 }, { "epoch": 7.007077265017254, "grad_norm": 0.3418391942977905, "learning_rate": 0.001, "loss": 2.0739, "step": 119800 }, { "epoch": 7.012926244370357, "grad_norm": 0.20784956216812134, "learning_rate": 0.001, "loss": 2.0765, "step": 119900 }, { "epoch": 7.01877522372346, "grad_norm": 0.2523118257522583, "learning_rate": 0.001, "loss": 2.0712, "step": 120000 }, { "epoch": 7.024624203076563, "grad_norm": 0.2682495415210724, "learning_rate": 0.001, "loss": 2.07, "step": 120100 }, { "epoch": 7.030473182429666, "grad_norm": 0.3094593584537506, "learning_rate": 0.001, "loss": 2.0719, "step": 120200 }, { "epoch": 7.036322161782769, "grad_norm": 0.33576640486717224, "learning_rate": 0.001, "loss": 2.0708, "step": 120300 }, { "epoch": 7.042171141135872, "grad_norm": 0.3821800947189331, "learning_rate": 0.001, "loss": 2.077, "step": 120400 }, { "epoch": 7.0480201204889745, "grad_norm": 0.2580956816673279, "learning_rate": 0.001, "loss": 2.0804, "step": 120500 }, { "epoch": 7.053869099842077, "grad_norm": 0.20208613574504852, "learning_rate": 0.001, "loss": 2.0679, "step": 120600 }, { "epoch": 7.05971807919518, "grad_norm": 0.24920496344566345, "learning_rate": 0.001, "loss": 2.0677, "step": 120700 }, { "epoch": 7.065567058548283, "grad_norm": 0.18436652421951294, "learning_rate": 0.001, "loss": 2.0747, "step": 120800 }, { "epoch": 7.071416037901386, "grad_norm": 0.2504202425479889, "learning_rate": 0.001, "loss": 2.0798, "step": 120900 }, { "epoch": 7.077265017254489, "grad_norm": 0.4639687240123749, "learning_rate": 0.001, "loss": 2.0856, "step": 121000 }, { "epoch": 7.083113996607592, "grad_norm": 0.22593283653259277, "learning_rate": 0.001, "loss": 2.0742, "step": 121100 }, { "epoch": 7.088962975960695, "grad_norm": 0.2184661328792572, "learning_rate": 0.001, "loss": 2.0697, "step": 121200 }, { "epoch": 7.094811955313798, "grad_norm": 0.31937524676322937, "learning_rate": 0.001, "loss": 2.0767, "step": 121300 }, { "epoch": 7.100660934666901, "grad_norm": 0.3400152325630188, "learning_rate": 0.001, "loss": 2.0703, "step": 121400 }, { "epoch": 7.106509914020004, "grad_norm": 0.2653490900993347, "learning_rate": 0.001, "loss": 2.0756, "step": 121500 }, { "epoch": 7.112358893373107, "grad_norm": 0.390118807554245, "learning_rate": 0.001, "loss": 2.0802, "step": 121600 }, { "epoch": 7.1182078727262095, "grad_norm": 0.21510018408298492, "learning_rate": 0.001, "loss": 2.0743, "step": 121700 }, { "epoch": 7.124056852079312, "grad_norm": 0.27118805050849915, "learning_rate": 0.001, "loss": 2.077, "step": 121800 }, { "epoch": 7.129905831432415, "grad_norm": 0.22034794092178345, "learning_rate": 0.001, "loss": 2.0724, "step": 121900 }, { "epoch": 7.135754810785518, "grad_norm": 0.27924802899360657, "learning_rate": 0.001, "loss": 2.0736, "step": 122000 }, { "epoch": 7.141603790138621, "grad_norm": 0.2775343358516693, "learning_rate": 0.001, "loss": 2.0733, "step": 122100 }, { "epoch": 7.147452769491724, "grad_norm": 0.2640744745731354, "learning_rate": 0.001, "loss": 2.0755, "step": 122200 }, { "epoch": 7.153301748844827, "grad_norm": 0.26359713077545166, "learning_rate": 0.001, "loss": 2.0753, "step": 122300 }, { "epoch": 7.15915072819793, "grad_norm": 0.1970929205417633, "learning_rate": 0.001, "loss": 2.0778, "step": 122400 }, { "epoch": 7.164999707551033, "grad_norm": 0.28102871775627136, "learning_rate": 0.001, "loss": 2.0821, "step": 122500 }, { "epoch": 7.170848686904135, "grad_norm": 0.32656770944595337, "learning_rate": 0.001, "loss": 2.0805, "step": 122600 }, { "epoch": 7.176697666257238, "grad_norm": 0.22590653598308563, "learning_rate": 0.001, "loss": 2.0835, "step": 122700 }, { "epoch": 7.182546645610341, "grad_norm": 0.24680742621421814, "learning_rate": 0.001, "loss": 2.0746, "step": 122800 }, { "epoch": 7.1883956249634435, "grad_norm": 0.23503178358078003, "learning_rate": 0.001, "loss": 2.0753, "step": 122900 }, { "epoch": 7.194244604316546, "grad_norm": 0.23945532739162445, "learning_rate": 0.001, "loss": 2.0734, "step": 123000 }, { "epoch": 7.200093583669649, "grad_norm": 0.24956107139587402, "learning_rate": 0.001, "loss": 2.0752, "step": 123100 }, { "epoch": 7.205942563022752, "grad_norm": 0.22509053349494934, "learning_rate": 0.001, "loss": 2.0748, "step": 123200 }, { "epoch": 7.211791542375855, "grad_norm": 0.2578199505805969, "learning_rate": 0.001, "loss": 2.074, "step": 123300 }, { "epoch": 7.217640521728958, "grad_norm": 0.23456628620624542, "learning_rate": 0.001, "loss": 2.0713, "step": 123400 }, { "epoch": 7.223489501082061, "grad_norm": 0.2689022123813629, "learning_rate": 0.001, "loss": 2.0776, "step": 123500 }, { "epoch": 7.229338480435164, "grad_norm": 0.2737118899822235, "learning_rate": 0.001, "loss": 2.0749, "step": 123600 }, { "epoch": 7.235187459788267, "grad_norm": 0.37303411960601807, "learning_rate": 0.001, "loss": 2.0826, "step": 123700 }, { "epoch": 7.24103643914137, "grad_norm": 0.24357691407203674, "learning_rate": 0.001, "loss": 2.076, "step": 123800 }, { "epoch": 7.246885418494473, "grad_norm": 0.23624470829963684, "learning_rate": 0.001, "loss": 2.0698, "step": 123900 }, { "epoch": 7.252734397847576, "grad_norm": 0.21941761672496796, "learning_rate": 0.001, "loss": 2.075, "step": 124000 }, { "epoch": 7.2585833772006785, "grad_norm": 0.2654819190502167, "learning_rate": 0.001, "loss": 2.0725, "step": 124100 }, { "epoch": 7.264432356553781, "grad_norm": 0.24955390393733978, "learning_rate": 0.001, "loss": 2.0702, "step": 124200 }, { "epoch": 7.270281335906884, "grad_norm": 0.26887187361717224, "learning_rate": 0.001, "loss": 2.071, "step": 124300 }, { "epoch": 7.276130315259987, "grad_norm": 0.3310205936431885, "learning_rate": 0.001, "loss": 2.0728, "step": 124400 }, { "epoch": 7.28197929461309, "grad_norm": 0.2011602520942688, "learning_rate": 0.001, "loss": 2.0754, "step": 124500 }, { "epoch": 7.287828273966193, "grad_norm": 0.27676528692245483, "learning_rate": 0.001, "loss": 2.0729, "step": 124600 }, { "epoch": 7.293677253319296, "grad_norm": 0.22343049943447113, "learning_rate": 0.001, "loss": 2.073, "step": 124700 }, { "epoch": 7.299526232672399, "grad_norm": 0.25157618522644043, "learning_rate": 0.001, "loss": 2.0753, "step": 124800 }, { "epoch": 7.305375212025502, "grad_norm": 0.25796037912368774, "learning_rate": 0.001, "loss": 2.0725, "step": 124900 }, { "epoch": 7.311224191378605, "grad_norm": 0.22466489672660828, "learning_rate": 0.001, "loss": 2.0767, "step": 125000 }, { "epoch": 7.311224191378605, "eval_ag_news_accuracy": 0.222546875, "eval_ag_news_bleu_score": 4.70030450513735, "eval_ag_news_bleu_score_sem": 0.3379984471364544, "eval_ag_news_emb_cos_sim": 0.6468570232391357, "eval_ag_news_emb_cos_sim_sem": 0.014782190322875977, "eval_ag_news_emb_top1_equal": 0.9140625, "eval_ag_news_emb_top1_equal_sem": 0.024870097637176514, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.8832719326019287, "eval_ag_news_n_ngrams_match_1": 11.390625, "eval_ag_news_n_ngrams_match_2": 2.53125, "eval_ag_news_n_ngrams_match_3": 0.859375, "eval_ag_news_num_pred_words": 50.25, "eval_ag_news_num_true_words": 44.7734375, "eval_ag_news_perplexity": 17.872655740625877, "eval_ag_news_pred_num_tokens": 83.515625, "eval_ag_news_rouge_score": 0.2302554494572775, "eval_ag_news_runtime": 76.8245, "eval_ag_news_samples_per_second": 6.508, "eval_ag_news_steps_per_second": 0.013, "eval_ag_news_token_set_f1": 0.27607731774242944, "eval_ag_news_token_set_f1_sem": 0.009395422572535119, "eval_ag_news_token_set_precision": 0.25610686239693004, "eval_ag_news_token_set_recall": 0.3157671617060658, "eval_ag_news_true_num_tokens": 63.0625, "step": 125000 }, { "epoch": 7.311224191378605, "eval_anthropic_toxic_prompts_accuracy": 0.09809375, "eval_anthropic_toxic_prompts_bleu_score": 40.11462631977428, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.726139982753297, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.873968243598938, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010709733702242374, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1484375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0315484639796987, "eval_anthropic_toxic_prompts_loss": 1.3622822761535645, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.8125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.0859375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.1171875, "eval_anthropic_toxic_prompts_num_pred_words": 15.2734375, "eval_anthropic_toxic_prompts_num_true_words": 14.7890625, "eval_anthropic_toxic_prompts_perplexity": 3.905095645797094, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.046875, "eval_anthropic_toxic_prompts_rouge_score": 0.6712480140122941, "eval_anthropic_toxic_prompts_runtime": 31.7191, "eval_anthropic_toxic_prompts_samples_per_second": 15.763, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.6944590445730486, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01889699388827009, "eval_anthropic_toxic_prompts_token_set_precision": 0.6928681619628541, "eval_anthropic_toxic_prompts_token_set_recall": 0.7020414342143727, "eval_anthropic_toxic_prompts_true_num_tokens": 18.34375, "step": 125000 }, { "epoch": 7.311224191378605, "eval_arxiv_accuracy": 0.364703125, "eval_arxiv_bleu_score": 1.5295910163503852, "eval_arxiv_bleu_score_sem": 0.17383002223457839, "eval_arxiv_emb_cos_sim": 0.40495359897613525, "eval_arxiv_emb_cos_sim_sem": 0.018030690029263496, "eval_arxiv_emb_top1_equal": 0.875, "eval_arxiv_emb_top1_equal_sem": 0.029346559196710587, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.5039260387420654, "eval_arxiv_n_ngrams_match_1": 10.796875, "eval_arxiv_n_ngrams_match_2": 1.765625, "eval_arxiv_n_ngrams_match_3": 0.4140625, "eval_arxiv_num_pred_words": 51.2109375, "eval_arxiv_num_true_words": 83.546875, "eval_arxiv_perplexity": 33.24572005751762, "eval_arxiv_pred_num_tokens": 126.109375, "eval_arxiv_rouge_score": 0.15054685967336567, "eval_arxiv_runtime": 30.3856, "eval_arxiv_samples_per_second": 16.455, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.15032508667212885, "eval_arxiv_token_set_f1_sem": 0.008614354894372699, "eval_arxiv_token_set_precision": 0.09920766247705597, "eval_arxiv_token_set_recall": 0.3948075118822808, "eval_arxiv_true_num_tokens": 124.3125, "step": 125000 }, { "epoch": 7.311224191378605, "eval_python_code_alpaca_accuracy": 0.12378125, "eval_python_code_alpaca_bleu_score": 24.628686674922008, "eval_python_code_alpaca_bleu_score_sem": 1.4975897037376449, "eval_python_code_alpaca_emb_cos_sim": 0.8404046297073364, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011057025752961636, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.557314157485962, "eval_python_code_alpaca_n_ngrams_match_1": 9.765625, "eval_python_code_alpaca_n_ngrams_match_2": 5.03125, "eval_python_code_alpaca_n_ngrams_match_3": 2.6796875, "eval_python_code_alpaca_num_pred_words": 17.171875, "eval_python_code_alpaca_num_true_words": 18.4140625, "eval_python_code_alpaca_perplexity": 4.746056949837647, "eval_python_code_alpaca_pred_num_tokens": 23.578125, "eval_python_code_alpaca_rouge_score": 0.5651678154970192, "eval_python_code_alpaca_runtime": 30.0246, "eval_python_code_alpaca_samples_per_second": 16.653, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.5888641147781194, "eval_python_code_alpaca_token_set_f1_sem": 0.012978647444148592, "eval_python_code_alpaca_token_set_precision": 0.5694697562118086, "eval_python_code_alpaca_token_set_recall": 0.6183146320609931, "eval_python_code_alpaca_true_num_tokens": 23.6875, "step": 125000 }, { "epoch": 7.311224191378605, "eval_wikibio_accuracy": 0.358640625, "eval_wikibio_bleu_score": 7.1124266643148335, "eval_wikibio_bleu_score_sem": 0.758111872556493, "eval_wikibio_emb_cos_sim": 0.5155909061431885, "eval_wikibio_emb_cos_sim_sem": 0.02470424585044384, "eval_wikibio_emb_top1_equal": 0.8359375, "eval_wikibio_emb_top1_equal_sem": 0.032861676067113876, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.799502372741699, "eval_wikibio_n_ngrams_match_1": 12.9453125, "eval_wikibio_n_ngrams_match_2": 4.4453125, "eval_wikibio_n_ngrams_match_3": 1.921875, "eval_wikibio_num_pred_words": 49.0390625, "eval_wikibio_num_true_words": 49.1015625, "eval_wikibio_perplexity": 16.43646550239061, "eval_wikibio_pred_num_tokens": 109.8203125, "eval_wikibio_rouge_score": 0.25226226101435345, "eval_wikibio_runtime": 30.3845, "eval_wikibio_samples_per_second": 16.456, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2820998904699641, "eval_wikibio_token_set_f1_sem": 0.014674643530311634, "eval_wikibio_token_set_precision": 0.247712191879688, "eval_wikibio_token_set_recall": 0.36390491505749173, "eval_wikibio_true_num_tokens": 95.4296875, "step": 125000 }, { "epoch": 7.311224191378605, "eval_msmarco_accuracy": 0.368671875, "eval_msmarco_bleu_score": 13.807021444597407, "eval_msmarco_bleu_score_sem": 1.2023365693708084, "eval_msmarco_emb_cos_sim": 0.7132757902145386, "eval_msmarco_emb_cos_sim_sem": 0.01909194141626358, "eval_msmarco_emb_top1_equal": 0.9296875, "eval_msmarco_emb_top1_equal_sem": 0.022687306627631187, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.9546986818313599, "eval_msmarco_n_ngrams_match_1": 26.1953125, "eval_msmarco_n_ngrams_match_2": 10.5859375, "eval_msmarco_n_ngrams_match_3": 5.6640625, "eval_msmarco_num_pred_words": 64.203125, "eval_msmarco_num_true_words": 62.34375, "eval_msmarco_perplexity": 7.061790857201284, "eval_msmarco_pred_num_tokens": 90.0078125, "eval_msmarco_rouge_score": 0.3989978986817838, "eval_msmarco_runtime": 25.7451, "eval_msmarco_samples_per_second": 19.421, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.42740223117514353, "eval_msmarco_token_set_f1_sem": 0.01315764458110019, "eval_msmarco_token_set_precision": 0.38473072499185756, "eval_msmarco_token_set_recall": 0.5064837834499109, "eval_msmarco_true_num_tokens": 80.390625, "step": 125000 }, { "epoch": 7.317073170731708, "grad_norm": 0.2765621542930603, "learning_rate": 0.001, "loss": 2.0738, "step": 125100 }, { "epoch": 7.3229221500848105, "grad_norm": 0.23979748785495758, "learning_rate": 0.001, "loss": 2.0789, "step": 125200 }, { "epoch": 7.328771129437913, "grad_norm": 0.32794421911239624, "learning_rate": 0.001, "loss": 2.0837, "step": 125300 }, { "epoch": 7.334620108791016, "grad_norm": 0.28452053666114807, "learning_rate": 0.001, "loss": 2.0739, "step": 125400 }, { "epoch": 7.340469088144119, "grad_norm": 0.17465296387672424, "learning_rate": 0.001, "loss": 2.0704, "step": 125500 }, { "epoch": 7.346318067497222, "grad_norm": 0.22225384414196014, "learning_rate": 0.001, "loss": 2.068, "step": 125600 }, { "epoch": 7.352167046850324, "grad_norm": 0.23989394307136536, "learning_rate": 0.001, "loss": 2.0688, "step": 125700 }, { "epoch": 7.358016026203427, "grad_norm": 0.18017326295375824, "learning_rate": 0.001, "loss": 2.0694, "step": 125800 }, { "epoch": 7.36386500555653, "grad_norm": 0.21585138142108917, "learning_rate": 0.001, "loss": 2.0714, "step": 125900 }, { "epoch": 7.369713984909633, "grad_norm": 0.2795255780220032, "learning_rate": 0.001, "loss": 2.0769, "step": 126000 }, { "epoch": 7.375562964262736, "grad_norm": 0.24513109028339386, "learning_rate": 0.001, "loss": 2.072, "step": 126100 }, { "epoch": 7.381411943615839, "grad_norm": 0.2730487585067749, "learning_rate": 0.001, "loss": 2.0776, "step": 126200 }, { "epoch": 7.387260922968942, "grad_norm": 0.25820598006248474, "learning_rate": 0.001, "loss": 2.07, "step": 126300 }, { "epoch": 7.393109902322045, "grad_norm": 0.249063178896904, "learning_rate": 0.001, "loss": 2.0776, "step": 126400 }, { "epoch": 7.3989588816751475, "grad_norm": 0.3316287696361542, "learning_rate": 0.001, "loss": 2.0734, "step": 126500 }, { "epoch": 7.40480786102825, "grad_norm": 0.3540516197681427, "learning_rate": 0.001, "loss": 2.0771, "step": 126600 }, { "epoch": 7.410656840381353, "grad_norm": 0.2837873101234436, "learning_rate": 0.001, "loss": 2.0822, "step": 126700 }, { "epoch": 7.416505819734456, "grad_norm": 0.18533584475517273, "learning_rate": 0.001, "loss": 2.0745, "step": 126800 }, { "epoch": 7.422354799087559, "grad_norm": 0.27746325731277466, "learning_rate": 0.001, "loss": 2.08, "step": 126900 }, { "epoch": 7.428203778440662, "grad_norm": 0.2199859470129013, "learning_rate": 0.001, "loss": 2.0699, "step": 127000 }, { "epoch": 7.434052757793765, "grad_norm": 0.3435116410255432, "learning_rate": 0.001, "loss": 2.0696, "step": 127100 }, { "epoch": 7.439901737146868, "grad_norm": 0.2745727598667145, "learning_rate": 0.001, "loss": 2.0775, "step": 127200 }, { "epoch": 7.445750716499971, "grad_norm": 0.19565510749816895, "learning_rate": 0.001, "loss": 2.0828, "step": 127300 }, { "epoch": 7.451599695853074, "grad_norm": 0.2935641407966614, "learning_rate": 0.001, "loss": 2.0692, "step": 127400 }, { "epoch": 7.457448675206177, "grad_norm": 0.21367697417736053, "learning_rate": 0.001, "loss": 2.0713, "step": 127500 }, { "epoch": 7.4632976545592795, "grad_norm": 0.21580560505390167, "learning_rate": 0.001, "loss": 2.0671, "step": 127600 }, { "epoch": 7.469146633912382, "grad_norm": 0.323560506105423, "learning_rate": 0.001, "loss": 2.0698, "step": 127700 }, { "epoch": 7.474995613265485, "grad_norm": 0.29000452160835266, "learning_rate": 0.001, "loss": 2.0719, "step": 127800 }, { "epoch": 7.480844592618588, "grad_norm": 0.17351895570755005, "learning_rate": 0.001, "loss": 2.0724, "step": 127900 }, { "epoch": 7.486693571971691, "grad_norm": 0.25914520025253296, "learning_rate": 0.001, "loss": 2.0743, "step": 128000 }, { "epoch": 7.492542551324794, "grad_norm": 0.23491768538951874, "learning_rate": 0.001, "loss": 2.0786, "step": 128100 }, { "epoch": 7.498391530677897, "grad_norm": 0.28302115201950073, "learning_rate": 0.001, "loss": 2.071, "step": 128200 }, { "epoch": 7.504240510031, "grad_norm": 0.429107666015625, "learning_rate": 0.001, "loss": 2.078, "step": 128300 }, { "epoch": 7.510089489384103, "grad_norm": 0.3500305712223053, "learning_rate": 0.001, "loss": 2.0818, "step": 128400 }, { "epoch": 7.515938468737206, "grad_norm": 0.1857694387435913, "learning_rate": 0.001, "loss": 2.0717, "step": 128500 }, { "epoch": 7.521787448090308, "grad_norm": 0.23258157074451447, "learning_rate": 0.001, "loss": 2.0736, "step": 128600 }, { "epoch": 7.5276364274434115, "grad_norm": 0.2636478841304779, "learning_rate": 0.001, "loss": 2.0715, "step": 128700 }, { "epoch": 7.533485406796514, "grad_norm": 0.2792002558708191, "learning_rate": 0.001, "loss": 2.076, "step": 128800 }, { "epoch": 7.539334386149617, "grad_norm": 0.2696416676044464, "learning_rate": 0.001, "loss": 2.0735, "step": 128900 }, { "epoch": 7.545183365502719, "grad_norm": 0.2411583662033081, "learning_rate": 0.001, "loss": 2.0705, "step": 129000 }, { "epoch": 7.551032344855822, "grad_norm": 0.3007240891456604, "learning_rate": 0.001, "loss": 2.0691, "step": 129100 }, { "epoch": 7.556881324208925, "grad_norm": 0.2479754239320755, "learning_rate": 0.001, "loss": 2.0691, "step": 129200 }, { "epoch": 7.562730303562028, "grad_norm": 0.23146720230579376, "learning_rate": 0.001, "loss": 2.0708, "step": 129300 }, { "epoch": 7.568579282915131, "grad_norm": 0.2413623183965683, "learning_rate": 0.001, "loss": 2.0734, "step": 129400 }, { "epoch": 7.574428262268234, "grad_norm": 0.22136470675468445, "learning_rate": 0.001, "loss": 2.071, "step": 129500 }, { "epoch": 7.580277241621337, "grad_norm": 0.24660733342170715, "learning_rate": 0.001, "loss": 2.0711, "step": 129600 }, { "epoch": 7.58612622097444, "grad_norm": 0.30816319584846497, "learning_rate": 0.001, "loss": 2.0762, "step": 129700 }, { "epoch": 7.591975200327543, "grad_norm": 0.26505714654922485, "learning_rate": 0.001, "loss": 2.0786, "step": 129800 }, { "epoch": 7.597824179680646, "grad_norm": 0.22754602134227753, "learning_rate": 0.001, "loss": 2.0726, "step": 129900 }, { "epoch": 7.6036731590337485, "grad_norm": 0.29982227087020874, "learning_rate": 0.001, "loss": 2.0751, "step": 130000 }, { "epoch": 7.609522138386851, "grad_norm": 0.1988113522529602, "learning_rate": 0.001, "loss": 2.0667, "step": 130100 }, { "epoch": 7.615371117739954, "grad_norm": 0.1933000683784485, "learning_rate": 0.001, "loss": 2.0736, "step": 130200 }, { "epoch": 7.621220097093057, "grad_norm": 0.2426053136587143, "learning_rate": 0.001, "loss": 2.0698, "step": 130300 }, { "epoch": 7.62706907644616, "grad_norm": 0.3672468960285187, "learning_rate": 0.001, "loss": 2.0784, "step": 130400 }, { "epoch": 7.632918055799263, "grad_norm": 0.2067863494157791, "learning_rate": 0.001, "loss": 2.0732, "step": 130500 }, { "epoch": 7.638767035152366, "grad_norm": 0.26048386096954346, "learning_rate": 0.001, "loss": 2.0694, "step": 130600 }, { "epoch": 7.644616014505469, "grad_norm": 0.3110027611255646, "learning_rate": 0.001, "loss": 2.0757, "step": 130700 }, { "epoch": 7.650464993858572, "grad_norm": 0.31026890873908997, "learning_rate": 0.001, "loss": 2.0794, "step": 130800 }, { "epoch": 7.656313973211675, "grad_norm": 0.2511734068393707, "learning_rate": 0.001, "loss": 2.0695, "step": 130900 }, { "epoch": 7.662162952564778, "grad_norm": 0.23310036957263947, "learning_rate": 0.001, "loss": 2.0684, "step": 131000 }, { "epoch": 7.6680119319178806, "grad_norm": 0.22600974142551422, "learning_rate": 0.001, "loss": 2.0671, "step": 131100 }, { "epoch": 7.6738609112709835, "grad_norm": 0.23699116706848145, "learning_rate": 0.001, "loss": 2.0695, "step": 131200 }, { "epoch": 7.679709890624086, "grad_norm": 0.22750291228294373, "learning_rate": 0.001, "loss": 2.0759, "step": 131300 }, { "epoch": 7.685558869977189, "grad_norm": 0.2528931796550751, "learning_rate": 0.001, "loss": 2.0724, "step": 131400 }, { "epoch": 7.691407849330292, "grad_norm": 0.27370485663414, "learning_rate": 0.001, "loss": 2.0707, "step": 131500 }, { "epoch": 7.697256828683395, "grad_norm": 0.3002682030200958, "learning_rate": 0.001, "loss": 2.074, "step": 131600 }, { "epoch": 7.703105808036497, "grad_norm": 0.41661080718040466, "learning_rate": 0.001, "loss": 2.0747, "step": 131700 }, { "epoch": 7.708954787389601, "grad_norm": 0.252957284450531, "learning_rate": 0.001, "loss": 2.0676, "step": 131800 }, { "epoch": 7.714803766742703, "grad_norm": 0.2418520152568817, "learning_rate": 0.001, "loss": 2.066, "step": 131900 }, { "epoch": 7.720652746095807, "grad_norm": 0.2774057388305664, "learning_rate": 0.001, "loss": 2.0735, "step": 132000 }, { "epoch": 7.726501725448909, "grad_norm": 0.27182286977767944, "learning_rate": 0.001, "loss": 2.0633, "step": 132100 }, { "epoch": 7.732350704802012, "grad_norm": 0.36273935437202454, "learning_rate": 0.001, "loss": 2.0718, "step": 132200 }, { "epoch": 7.738199684155115, "grad_norm": 0.2648675739765167, "learning_rate": 0.001, "loss": 2.0867, "step": 132300 }, { "epoch": 7.7440486635082175, "grad_norm": 0.286564439535141, "learning_rate": 0.001, "loss": 2.0716, "step": 132400 }, { "epoch": 7.74989764286132, "grad_norm": 0.2301120012998581, "learning_rate": 0.001, "loss": 2.0739, "step": 132500 }, { "epoch": 7.755746622214423, "grad_norm": 0.273327499628067, "learning_rate": 0.001, "loss": 2.0683, "step": 132600 }, { "epoch": 7.761595601567526, "grad_norm": 0.23890554904937744, "learning_rate": 0.001, "loss": 2.0748, "step": 132700 }, { "epoch": 7.767444580920629, "grad_norm": 0.26626327633857727, "learning_rate": 0.001, "loss": 2.0728, "step": 132800 }, { "epoch": 7.773293560273732, "grad_norm": 0.2574378550052643, "learning_rate": 0.001, "loss": 2.0699, "step": 132900 }, { "epoch": 7.779142539626835, "grad_norm": 0.3110472559928894, "learning_rate": 0.001, "loss": 2.0729, "step": 133000 }, { "epoch": 7.784991518979938, "grad_norm": 0.2901819944381714, "learning_rate": 0.001, "loss": 2.0774, "step": 133100 }, { "epoch": 7.790840498333041, "grad_norm": 0.20799550414085388, "learning_rate": 0.001, "loss": 2.0719, "step": 133200 }, { "epoch": 7.796689477686144, "grad_norm": 0.24712491035461426, "learning_rate": 0.001, "loss": 2.0698, "step": 133300 }, { "epoch": 7.802538457039247, "grad_norm": 0.2687852382659912, "learning_rate": 0.001, "loss": 2.0691, "step": 133400 }, { "epoch": 7.80838743639235, "grad_norm": 0.2070963978767395, "learning_rate": 0.001, "loss": 2.0675, "step": 133500 }, { "epoch": 7.8142364157454525, "grad_norm": 0.28080347180366516, "learning_rate": 0.001, "loss": 2.0751, "step": 133600 }, { "epoch": 7.820085395098555, "grad_norm": 0.28282490372657776, "learning_rate": 0.001, "loss": 2.0719, "step": 133700 }, { "epoch": 7.825934374451658, "grad_norm": 0.24415090680122375, "learning_rate": 0.001, "loss": 2.0659, "step": 133800 }, { "epoch": 7.831783353804761, "grad_norm": 0.21900001168251038, "learning_rate": 0.001, "loss": 2.0697, "step": 133900 }, { "epoch": 7.837632333157864, "grad_norm": 0.22051143646240234, "learning_rate": 0.001, "loss": 2.068, "step": 134000 }, { "epoch": 7.843481312510967, "grad_norm": 0.2297656238079071, "learning_rate": 0.001, "loss": 2.072, "step": 134100 }, { "epoch": 7.84933029186407, "grad_norm": 0.3511248826980591, "learning_rate": 0.001, "loss": 2.0719, "step": 134200 }, { "epoch": 7.855179271217173, "grad_norm": 0.24551145732402802, "learning_rate": 0.001, "loss": 2.0749, "step": 134300 }, { "epoch": 7.861028250570276, "grad_norm": 0.2336602509021759, "learning_rate": 0.001, "loss": 2.0688, "step": 134400 }, { "epoch": 7.866877229923379, "grad_norm": 0.24629218876361847, "learning_rate": 0.001, "loss": 2.068, "step": 134500 }, { "epoch": 7.872726209276482, "grad_norm": 0.3039599061012268, "learning_rate": 0.001, "loss": 2.0685, "step": 134600 }, { "epoch": 7.8785751886295845, "grad_norm": 0.3489779233932495, "learning_rate": 0.001, "loss": 2.0695, "step": 134700 }, { "epoch": 7.8844241679826865, "grad_norm": 0.24694307148456573, "learning_rate": 0.001, "loss": 2.0722, "step": 134800 }, { "epoch": 7.89027314733579, "grad_norm": 0.2545180320739746, "learning_rate": 0.001, "loss": 2.0704, "step": 134900 }, { "epoch": 7.896122126688892, "grad_norm": 0.24579446017742157, "learning_rate": 0.001, "loss": 2.0657, "step": 135000 }, { "epoch": 7.901971106041996, "grad_norm": 0.282198041677475, "learning_rate": 0.001, "loss": 2.0712, "step": 135100 }, { "epoch": 7.907820085395098, "grad_norm": 0.29648664593696594, "learning_rate": 0.001, "loss": 2.0747, "step": 135200 }, { "epoch": 7.913669064748201, "grad_norm": 0.3147414028644562, "learning_rate": 0.001, "loss": 2.0724, "step": 135300 }, { "epoch": 7.919518044101304, "grad_norm": 0.21171703934669495, "learning_rate": 0.001, "loss": 2.0631, "step": 135400 }, { "epoch": 7.925367023454407, "grad_norm": 0.27054107189178467, "learning_rate": 0.001, "loss": 2.0688, "step": 135500 }, { "epoch": 7.93121600280751, "grad_norm": 0.24845534563064575, "learning_rate": 0.001, "loss": 2.0729, "step": 135600 }, { "epoch": 7.937064982160613, "grad_norm": 0.22791749238967896, "learning_rate": 0.001, "loss": 2.0745, "step": 135700 }, { "epoch": 7.942913961513716, "grad_norm": 0.2767455577850342, "learning_rate": 0.001, "loss": 2.0647, "step": 135800 }, { "epoch": 7.948762940866819, "grad_norm": 0.221189945936203, "learning_rate": 0.001, "loss": 2.0663, "step": 135900 }, { "epoch": 7.9546119202199215, "grad_norm": 0.2426004260778427, "learning_rate": 0.001, "loss": 2.0668, "step": 136000 }, { "epoch": 7.960460899573024, "grad_norm": 0.28624624013900757, "learning_rate": 0.001, "loss": 2.0725, "step": 136100 }, { "epoch": 7.966309878926127, "grad_norm": 0.2273360788822174, "learning_rate": 0.001, "loss": 2.0676, "step": 136200 }, { "epoch": 7.97215885827923, "grad_norm": 0.2087278962135315, "learning_rate": 0.001, "loss": 2.0711, "step": 136300 }, { "epoch": 7.978007837632333, "grad_norm": 0.24168454110622406, "learning_rate": 0.001, "loss": 2.0675, "step": 136400 }, { "epoch": 7.983856816985436, "grad_norm": 0.23527555167675018, "learning_rate": 0.001, "loss": 2.0663, "step": 136500 }, { "epoch": 7.989705796338539, "grad_norm": 0.1981627643108368, "learning_rate": 0.001, "loss": 2.0663, "step": 136600 }, { "epoch": 7.995554775691642, "grad_norm": 0.3156042993068695, "learning_rate": 0.001, "loss": 2.0682, "step": 136700 }, { "epoch": 8.001403755044745, "grad_norm": 0.20328278839588165, "learning_rate": 0.001, "loss": 2.0617, "step": 136800 }, { "epoch": 8.007252734397847, "grad_norm": 0.34675782918930054, "learning_rate": 0.001, "loss": 2.0493, "step": 136900 }, { "epoch": 8.01310171375095, "grad_norm": 0.375158429145813, "learning_rate": 0.001, "loss": 2.0486, "step": 137000 }, { "epoch": 8.018950693104053, "grad_norm": 0.2519529163837433, "learning_rate": 0.001, "loss": 2.0552, "step": 137100 }, { "epoch": 8.024799672457156, "grad_norm": 0.30045610666275024, "learning_rate": 0.001, "loss": 2.0529, "step": 137200 }, { "epoch": 8.030648651810258, "grad_norm": 0.3127552568912506, "learning_rate": 0.001, "loss": 2.0497, "step": 137300 }, { "epoch": 8.036497631163362, "grad_norm": 0.3762386441230774, "learning_rate": 0.001, "loss": 2.0561, "step": 137400 }, { "epoch": 8.042346610516464, "grad_norm": 0.2954087555408478, "learning_rate": 0.001, "loss": 2.0516, "step": 137500 }, { "epoch": 8.048195589869568, "grad_norm": 0.22714410722255707, "learning_rate": 0.001, "loss": 2.0435, "step": 137600 }, { "epoch": 8.05404456922267, "grad_norm": 0.23709549009799957, "learning_rate": 0.001, "loss": 2.0525, "step": 137700 }, { "epoch": 8.059893548575774, "grad_norm": 0.2755180895328522, "learning_rate": 0.001, "loss": 2.0507, "step": 137800 }, { "epoch": 8.065742527928876, "grad_norm": 0.23413032293319702, "learning_rate": 0.001, "loss": 2.0462, "step": 137900 }, { "epoch": 8.07159150728198, "grad_norm": 0.31125113368034363, "learning_rate": 0.001, "loss": 2.052, "step": 138000 }, { "epoch": 8.077440486635082, "grad_norm": 0.38427284359931946, "learning_rate": 0.001, "loss": 2.0541, "step": 138100 }, { "epoch": 8.083289465988186, "grad_norm": 0.28418153524398804, "learning_rate": 0.001, "loss": 2.0574, "step": 138200 }, { "epoch": 8.089138445341288, "grad_norm": 0.3197179436683655, "learning_rate": 0.001, "loss": 2.0516, "step": 138300 }, { "epoch": 8.094987424694391, "grad_norm": 0.29946160316467285, "learning_rate": 0.001, "loss": 2.047, "step": 138400 }, { "epoch": 8.100836404047493, "grad_norm": 0.2739807069301605, "learning_rate": 0.001, "loss": 2.0531, "step": 138500 }, { "epoch": 8.106685383400597, "grad_norm": 0.234921395778656, "learning_rate": 0.001, "loss": 2.0538, "step": 138600 }, { "epoch": 8.1125343627537, "grad_norm": 0.24310696125030518, "learning_rate": 0.001, "loss": 2.0564, "step": 138700 }, { "epoch": 8.118383342106803, "grad_norm": 0.2138548493385315, "learning_rate": 0.001, "loss": 2.0487, "step": 138800 }, { "epoch": 8.124232321459905, "grad_norm": 0.22339290380477905, "learning_rate": 0.001, "loss": 2.0512, "step": 138900 }, { "epoch": 8.130081300813009, "grad_norm": 0.20625323057174683, "learning_rate": 0.001, "loss": 2.0464, "step": 139000 }, { "epoch": 8.13593028016611, "grad_norm": 0.26861050724983215, "learning_rate": 0.001, "loss": 2.0607, "step": 139100 }, { "epoch": 8.141779259519215, "grad_norm": 0.24481835961341858, "learning_rate": 0.001, "loss": 2.0542, "step": 139200 }, { "epoch": 8.147628238872317, "grad_norm": 0.23541222512722015, "learning_rate": 0.001, "loss": 2.0557, "step": 139300 }, { "epoch": 8.15347721822542, "grad_norm": 0.2522171437740326, "learning_rate": 0.001, "loss": 2.0528, "step": 139400 }, { "epoch": 8.159326197578523, "grad_norm": 0.4757980704307556, "learning_rate": 0.001, "loss": 2.0544, "step": 139500 }, { "epoch": 8.165175176931626, "grad_norm": 0.252836138010025, "learning_rate": 0.001, "loss": 2.0563, "step": 139600 }, { "epoch": 8.171024156284728, "grad_norm": 0.24438871443271637, "learning_rate": 0.001, "loss": 2.062, "step": 139700 }, { "epoch": 8.17687313563783, "grad_norm": 0.37034308910369873, "learning_rate": 0.001, "loss": 2.0512, "step": 139800 }, { "epoch": 8.182722114990934, "grad_norm": 0.391443133354187, "learning_rate": 0.001, "loss": 2.0553, "step": 139900 }, { "epoch": 8.188571094344036, "grad_norm": 0.35713285207748413, "learning_rate": 0.001, "loss": 2.0511, "step": 140000 }, { "epoch": 8.19442007369714, "grad_norm": 0.3225921392440796, "learning_rate": 0.001, "loss": 2.0519, "step": 140100 }, { "epoch": 8.200269053050242, "grad_norm": 0.2646682858467102, "learning_rate": 0.001, "loss": 2.0569, "step": 140200 }, { "epoch": 8.206118032403346, "grad_norm": 0.25522667169570923, "learning_rate": 0.001, "loss": 2.0527, "step": 140300 }, { "epoch": 8.211967011756448, "grad_norm": 0.31862378120422363, "learning_rate": 0.001, "loss": 2.057, "step": 140400 }, { "epoch": 8.217815991109552, "grad_norm": 0.2648943066596985, "learning_rate": 0.001, "loss": 2.0471, "step": 140500 }, { "epoch": 8.223664970462654, "grad_norm": 0.2854939103126526, "learning_rate": 0.001, "loss": 2.0526, "step": 140600 }, { "epoch": 8.229513949815757, "grad_norm": 0.3167048692703247, "learning_rate": 0.001, "loss": 2.0472, "step": 140700 }, { "epoch": 8.23536292916886, "grad_norm": 0.3133721947669983, "learning_rate": 0.001, "loss": 2.0562, "step": 140800 }, { "epoch": 8.241211908521963, "grad_norm": 0.24277810752391815, "learning_rate": 0.001, "loss": 2.0524, "step": 140900 }, { "epoch": 8.247060887875065, "grad_norm": 0.2730344533920288, "learning_rate": 0.001, "loss": 2.0538, "step": 141000 }, { "epoch": 8.252909867228169, "grad_norm": 0.2896372377872467, "learning_rate": 0.001, "loss": 2.0516, "step": 141100 }, { "epoch": 8.258758846581271, "grad_norm": 0.36351707577705383, "learning_rate": 0.001, "loss": 2.0569, "step": 141200 }, { "epoch": 8.264607825934375, "grad_norm": 0.25832024216651917, "learning_rate": 0.001, "loss": 2.0533, "step": 141300 }, { "epoch": 8.270456805287477, "grad_norm": 0.22894759476184845, "learning_rate": 0.001, "loss": 2.0442, "step": 141400 }, { "epoch": 8.27630578464058, "grad_norm": 0.26379749178886414, "learning_rate": 0.001, "loss": 2.0522, "step": 141500 }, { "epoch": 8.282154763993683, "grad_norm": 0.27133533358573914, "learning_rate": 0.001, "loss": 2.059, "step": 141600 }, { "epoch": 8.288003743346787, "grad_norm": 0.2734687030315399, "learning_rate": 0.001, "loss": 2.0544, "step": 141700 }, { "epoch": 8.293852722699889, "grad_norm": 0.29513102769851685, "learning_rate": 0.001, "loss": 2.054, "step": 141800 }, { "epoch": 8.299701702052992, "grad_norm": 0.25679054856300354, "learning_rate": 0.001, "loss": 2.0527, "step": 141900 }, { "epoch": 8.305550681406094, "grad_norm": 0.32384389638900757, "learning_rate": 0.001, "loss": 2.0501, "step": 142000 }, { "epoch": 8.311399660759198, "grad_norm": 0.29712721705436707, "learning_rate": 0.001, "loss": 2.0516, "step": 142100 }, { "epoch": 8.3172486401123, "grad_norm": 0.25218820571899414, "learning_rate": 0.001, "loss": 2.0635, "step": 142200 }, { "epoch": 8.323097619465404, "grad_norm": 0.30641674995422363, "learning_rate": 0.001, "loss": 2.0536, "step": 142300 }, { "epoch": 8.328946598818506, "grad_norm": 0.2663896381855011, "learning_rate": 0.001, "loss": 2.0552, "step": 142400 }, { "epoch": 8.33479557817161, "grad_norm": 0.2617698311805725, "learning_rate": 0.001, "loss": 2.0513, "step": 142500 }, { "epoch": 8.340644557524712, "grad_norm": 0.21475650370121002, "learning_rate": 0.001, "loss": 2.0479, "step": 142600 }, { "epoch": 8.346493536877816, "grad_norm": 0.1941496729850769, "learning_rate": 0.001, "loss": 2.0506, "step": 142700 }, { "epoch": 8.352342516230918, "grad_norm": 0.31050559878349304, "learning_rate": 0.001, "loss": 2.058, "step": 142800 }, { "epoch": 8.35819149558402, "grad_norm": 0.3195195496082306, "learning_rate": 0.001, "loss": 2.0609, "step": 142900 }, { "epoch": 8.364040474937124, "grad_norm": 0.27041521668434143, "learning_rate": 0.001, "loss": 2.049, "step": 143000 }, { "epoch": 8.369889454290226, "grad_norm": 0.24115604162216187, "learning_rate": 0.001, "loss": 2.0518, "step": 143100 }, { "epoch": 8.37573843364333, "grad_norm": 0.35818174481391907, "learning_rate": 0.001, "loss": 2.0515, "step": 143200 }, { "epoch": 8.381587412996431, "grad_norm": 0.24784985184669495, "learning_rate": 0.001, "loss": 2.0546, "step": 143300 }, { "epoch": 8.387436392349535, "grad_norm": 0.24907523393630981, "learning_rate": 0.001, "loss": 2.0586, "step": 143400 }, { "epoch": 8.393285371702637, "grad_norm": 0.29623153805732727, "learning_rate": 0.001, "loss": 2.053, "step": 143500 }, { "epoch": 8.399134351055741, "grad_norm": 0.24483725428581238, "learning_rate": 0.001, "loss": 2.0468, "step": 143600 }, { "epoch": 8.404983330408843, "grad_norm": 0.25821784138679504, "learning_rate": 0.001, "loss": 2.053, "step": 143700 }, { "epoch": 8.410832309761947, "grad_norm": 0.3092058300971985, "learning_rate": 0.001, "loss": 2.0503, "step": 143800 }, { "epoch": 8.416681289115049, "grad_norm": 0.2927704155445099, "learning_rate": 0.001, "loss": 2.0541, "step": 143900 }, { "epoch": 8.422530268468153, "grad_norm": 0.22104287147521973, "learning_rate": 0.001, "loss": 2.0542, "step": 144000 }, { "epoch": 8.428379247821255, "grad_norm": 0.26355165243148804, "learning_rate": 0.001, "loss": 2.0542, "step": 144100 }, { "epoch": 8.434228227174358, "grad_norm": 0.2580408453941345, "learning_rate": 0.001, "loss": 2.0516, "step": 144200 }, { "epoch": 8.44007720652746, "grad_norm": 0.239335298538208, "learning_rate": 0.001, "loss": 2.0529, "step": 144300 }, { "epoch": 8.445926185880564, "grad_norm": 0.3075728714466095, "learning_rate": 0.001, "loss": 2.0527, "step": 144400 }, { "epoch": 8.451775165233666, "grad_norm": 0.2588115930557251, "learning_rate": 0.001, "loss": 2.0556, "step": 144500 }, { "epoch": 8.45762414458677, "grad_norm": 0.21744589507579803, "learning_rate": 0.001, "loss": 2.054, "step": 144600 }, { "epoch": 8.463473123939872, "grad_norm": 0.3078683316707611, "learning_rate": 0.001, "loss": 2.051, "step": 144700 }, { "epoch": 8.469322103292976, "grad_norm": 0.33693137764930725, "learning_rate": 0.001, "loss": 2.0491, "step": 144800 }, { "epoch": 8.475171082646078, "grad_norm": 0.2874291241168976, "learning_rate": 0.001, "loss": 2.0534, "step": 144900 }, { "epoch": 8.481020061999182, "grad_norm": 0.3008229732513428, "learning_rate": 0.001, "loss": 2.0528, "step": 145000 }, { "epoch": 8.486869041352284, "grad_norm": 0.26314303278923035, "learning_rate": 0.001, "loss": 2.0517, "step": 145100 }, { "epoch": 8.492718020705388, "grad_norm": 0.2545531690120697, "learning_rate": 0.001, "loss": 2.0576, "step": 145200 }, { "epoch": 8.49856700005849, "grad_norm": 0.22867703437805176, "learning_rate": 0.001, "loss": 2.0556, "step": 145300 }, { "epoch": 8.504415979411593, "grad_norm": 0.27990397810935974, "learning_rate": 0.001, "loss": 2.0576, "step": 145400 }, { "epoch": 8.510264958764695, "grad_norm": 0.2585389018058777, "learning_rate": 0.001, "loss": 2.0522, "step": 145500 }, { "epoch": 8.5161139381178, "grad_norm": 0.2625741958618164, "learning_rate": 0.001, "loss": 2.0514, "step": 145600 }, { "epoch": 8.521962917470901, "grad_norm": 0.2857014834880829, "learning_rate": 0.001, "loss": 2.0601, "step": 145700 }, { "epoch": 8.527811896824005, "grad_norm": 0.32933104038238525, "learning_rate": 0.001, "loss": 2.0516, "step": 145800 }, { "epoch": 8.533660876177107, "grad_norm": 0.25192439556121826, "learning_rate": 0.001, "loss": 2.0574, "step": 145900 }, { "epoch": 8.53950985553021, "grad_norm": 0.3756955564022064, "learning_rate": 0.001, "loss": 2.048, "step": 146000 }, { "epoch": 8.545358834883313, "grad_norm": 0.38765546679496765, "learning_rate": 0.001, "loss": 2.0493, "step": 146100 }, { "epoch": 8.551207814236415, "grad_norm": 0.3161045014858246, "learning_rate": 0.001, "loss": 2.0502, "step": 146200 }, { "epoch": 8.557056793589519, "grad_norm": 0.28747138381004333, "learning_rate": 0.001, "loss": 2.0464, "step": 146300 }, { "epoch": 8.56290577294262, "grad_norm": 0.2586744725704193, "learning_rate": 0.001, "loss": 2.0506, "step": 146400 }, { "epoch": 8.568754752295725, "grad_norm": 0.23891842365264893, "learning_rate": 0.001, "loss": 2.0566, "step": 146500 }, { "epoch": 8.574603731648827, "grad_norm": 0.2646450400352478, "learning_rate": 0.001, "loss": 2.0501, "step": 146600 }, { "epoch": 8.58045271100193, "grad_norm": 0.2264687418937683, "learning_rate": 0.001, "loss": 2.0514, "step": 146700 }, { "epoch": 8.586301690355032, "grad_norm": 0.25246086716651917, "learning_rate": 0.001, "loss": 2.0529, "step": 146800 }, { "epoch": 8.592150669708136, "grad_norm": 0.34876713156700134, "learning_rate": 0.001, "loss": 2.0507, "step": 146900 }, { "epoch": 8.597999649061238, "grad_norm": 0.3646022081375122, "learning_rate": 0.001, "loss": 2.058, "step": 147000 }, { "epoch": 8.603848628414342, "grad_norm": 0.3094925284385681, "learning_rate": 0.001, "loss": 2.05, "step": 147100 }, { "epoch": 8.609697607767444, "grad_norm": 0.326036274433136, "learning_rate": 0.001, "loss": 2.0514, "step": 147200 }, { "epoch": 8.615546587120548, "grad_norm": 0.3190648555755615, "learning_rate": 0.001, "loss": 2.0535, "step": 147300 }, { "epoch": 8.62139556647365, "grad_norm": 0.25392529368400574, "learning_rate": 0.001, "loss": 2.0497, "step": 147400 }, { "epoch": 8.627244545826754, "grad_norm": 0.2594636380672455, "learning_rate": 0.001, "loss": 2.0553, "step": 147500 }, { "epoch": 8.633093525179856, "grad_norm": 0.3087707757949829, "learning_rate": 0.001, "loss": 2.0524, "step": 147600 }, { "epoch": 8.63894250453296, "grad_norm": 0.26607629656791687, "learning_rate": 0.001, "loss": 2.0511, "step": 147700 }, { "epoch": 8.644791483886062, "grad_norm": 0.3282102346420288, "learning_rate": 0.001, "loss": 2.048, "step": 147800 }, { "epoch": 8.650640463239165, "grad_norm": 0.2562240660190582, "learning_rate": 0.001, "loss": 2.0526, "step": 147900 }, { "epoch": 8.656489442592267, "grad_norm": 0.28479379415512085, "learning_rate": 0.001, "loss": 2.0482, "step": 148000 }, { "epoch": 8.662338421945371, "grad_norm": 0.33760637044906616, "learning_rate": 0.001, "loss": 2.0535, "step": 148100 }, { "epoch": 8.668187401298473, "grad_norm": 0.31754693388938904, "learning_rate": 0.001, "loss": 2.0538, "step": 148200 }, { "epoch": 8.674036380651577, "grad_norm": 0.3490239083766937, "learning_rate": 0.001, "loss": 2.0539, "step": 148300 }, { "epoch": 8.679885360004679, "grad_norm": 0.23816856741905212, "learning_rate": 0.001, "loss": 2.0455, "step": 148400 }, { "epoch": 8.685734339357783, "grad_norm": 0.4276590049266815, "learning_rate": 0.001, "loss": 2.0519, "step": 148500 }, { "epoch": 8.691583318710885, "grad_norm": 0.26247578859329224, "learning_rate": 0.001, "loss": 2.0444, "step": 148600 }, { "epoch": 8.697432298063989, "grad_norm": 0.2626059353351593, "learning_rate": 0.001, "loss": 2.0479, "step": 148700 }, { "epoch": 8.70328127741709, "grad_norm": 0.28943368792533875, "learning_rate": 0.001, "loss": 2.0467, "step": 148800 }, { "epoch": 8.709130256770194, "grad_norm": 0.3208303451538086, "learning_rate": 0.001, "loss": 2.0577, "step": 148900 }, { "epoch": 8.714979236123297, "grad_norm": 0.24343618750572205, "learning_rate": 0.001, "loss": 2.0486, "step": 149000 }, { "epoch": 8.720828215476399, "grad_norm": 0.2698855698108673, "learning_rate": 0.001, "loss": 2.0451, "step": 149100 }, { "epoch": 8.726677194829502, "grad_norm": 0.2107623815536499, "learning_rate": 0.001, "loss": 2.052, "step": 149200 }, { "epoch": 8.732526174182604, "grad_norm": 0.4212949275970459, "learning_rate": 0.001, "loss": 2.0479, "step": 149300 }, { "epoch": 8.738375153535708, "grad_norm": 0.2790066599845886, "learning_rate": 0.001, "loss": 2.0469, "step": 149400 }, { "epoch": 8.74422413288881, "grad_norm": 0.2954109013080597, "learning_rate": 0.001, "loss": 2.0521, "step": 149500 }, { "epoch": 8.750073112241914, "grad_norm": 0.3808009922504425, "learning_rate": 0.001, "loss": 2.0516, "step": 149600 }, { "epoch": 8.755922091595016, "grad_norm": 0.2496301829814911, "learning_rate": 0.001, "loss": 2.0473, "step": 149700 }, { "epoch": 8.76177107094812, "grad_norm": 0.27654582262039185, "learning_rate": 0.001, "loss": 2.0527, "step": 149800 }, { "epoch": 8.767620050301222, "grad_norm": 0.2881701588630676, "learning_rate": 0.001, "loss": 2.052, "step": 149900 }, { "epoch": 8.773469029654326, "grad_norm": 0.2456066608428955, "learning_rate": 0.001, "loss": 2.0506, "step": 150000 }, { "epoch": 8.773469029654326, "eval_ag_news_accuracy": 0.2340625, "eval_ag_news_bleu_score": 6.543084086155398, "eval_ag_news_bleu_score_sem": 0.4679742870619626, "eval_ag_news_emb_cos_sim": 0.7083446979522705, "eval_ag_news_emb_cos_sim_sem": 0.015485579147934914, "eval_ag_news_emb_top1_equal": 0.9375, "eval_ag_news_emb_top1_equal_sem": 0.02147948183119297, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.8317644596099854, "eval_ag_news_n_ngrams_match_1": 12.8984375, "eval_ag_news_n_ngrams_match_2": 3.7578125, "eval_ag_news_n_ngrams_match_3": 1.4296875, "eval_ag_news_num_pred_words": 45.3125, "eval_ag_news_num_true_words": 43.765625, "eval_ag_news_perplexity": 16.975386799561527, "eval_ag_news_pred_num_tokens": 71.15625, "eval_ag_news_rouge_score": 0.2766477633307015, "eval_ag_news_runtime": 36.5973, "eval_ag_news_samples_per_second": 13.662, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3144254441832824, "eval_ag_news_token_set_f1_sem": 0.009883760201671208, "eval_ag_news_token_set_precision": 0.29734307850046626, "eval_ag_news_token_set_recall": 0.34814745147798826, "eval_ag_news_true_num_tokens": 60.4453125, "step": 150000 }, { "epoch": 8.773469029654326, "eval_anthropic_toxic_prompts_accuracy": 0.10075, "eval_anthropic_toxic_prompts_bleu_score": 37.094483751868836, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.5185050227382866, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8668363094329834, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010551081039011478, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.09375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.025864720370468334, "eval_anthropic_toxic_prompts_loss": 1.3719878196716309, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.953125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.3515625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.3359375, "eval_anthropic_toxic_prompts_num_pred_words": 15.203125, "eval_anthropic_toxic_prompts_num_true_words": 15.234375, "eval_anthropic_toxic_prompts_perplexity": 3.9431812432776945, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.2421875, "eval_anthropic_toxic_prompts_rouge_score": 0.6388391056892138, "eval_anthropic_toxic_prompts_runtime": 29.5546, "eval_anthropic_toxic_prompts_samples_per_second": 16.918, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6565927566069699, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01967446759733237, "eval_anthropic_toxic_prompts_token_set_precision": 0.6555202707518483, "eval_anthropic_toxic_prompts_token_set_recall": 0.6621907707147974, "eval_anthropic_toxic_prompts_true_num_tokens": 18.6796875, "step": 150000 }, { "epoch": 8.773469029654326, "eval_arxiv_accuracy": 0.364953125, "eval_arxiv_bleu_score": 1.750468365567949, "eval_arxiv_bleu_score_sem": 0.15362146879649163, "eval_arxiv_emb_cos_sim": 0.4338107109069824, "eval_arxiv_emb_cos_sim_sem": 0.0185563862323761, "eval_arxiv_emb_top1_equal": 0.859375, "eval_arxiv_emb_top1_equal_sem": 0.03084755875170231, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.5070643424987793, "eval_arxiv_n_ngrams_match_1": 14.296875, "eval_arxiv_n_ngrams_match_2": 2.3828125, "eval_arxiv_n_ngrams_match_3": 0.4765625, "eval_arxiv_num_pred_words": 56.625, "eval_arxiv_num_true_words": 86.5078125, "eval_arxiv_perplexity": 33.350219114793646, "eval_arxiv_pred_num_tokens": 125.6640625, "eval_arxiv_rouge_score": 0.185945587860452, "eval_arxiv_runtime": 30.7193, "eval_arxiv_samples_per_second": 16.276, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.17524388895429868, "eval_arxiv_token_set_f1_sem": 0.008542942587940363, "eval_arxiv_token_set_precision": 0.11815904362223373, "eval_arxiv_token_set_recall": 0.41171675938381863, "eval_arxiv_true_num_tokens": 125.59375, "step": 150000 }, { "epoch": 8.773469029654326, "eval_python_code_alpaca_accuracy": 0.1255, "eval_python_code_alpaca_bleu_score": 23.7297657075811, "eval_python_code_alpaca_bleu_score_sem": 1.564749373338586, "eval_python_code_alpaca_emb_cos_sim": 0.8407459855079651, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011009645648300648, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.5981696844100952, "eval_python_code_alpaca_n_ngrams_match_1": 10.46875, "eval_python_code_alpaca_n_ngrams_match_2": 5.375, "eval_python_code_alpaca_n_ngrams_match_3": 2.78125, "eval_python_code_alpaca_num_pred_words": 19.671875, "eval_python_code_alpaca_num_true_words": 20.421875, "eval_python_code_alpaca_perplexity": 4.943975103337951, "eval_python_code_alpaca_pred_num_tokens": 27.046875, "eval_python_code_alpaca_rouge_score": 0.5561869945083138, "eval_python_code_alpaca_runtime": 30.8813, "eval_python_code_alpaca_samples_per_second": 16.191, "eval_python_code_alpaca_steps_per_second": 0.032, "eval_python_code_alpaca_token_set_f1": 0.5796925814528043, "eval_python_code_alpaca_token_set_f1_sem": 0.013846340451071429, "eval_python_code_alpaca_token_set_precision": 0.5596178925382026, "eval_python_code_alpaca_token_set_recall": 0.6089281217496995, "eval_python_code_alpaca_true_num_tokens": 26.4921875, "step": 150000 }, { "epoch": 8.773469029654326, "eval_wikibio_accuracy": 0.351875, "eval_wikibio_bleu_score": 7.454439246155504, "eval_wikibio_bleu_score_sem": 0.7549415410850874, "eval_wikibio_emb_cos_sim": 0.5752383470535278, "eval_wikibio_emb_cos_sim_sem": 0.02392849139869213, "eval_wikibio_emb_top1_equal": 0.8828125, "eval_wikibio_emb_top1_equal_sem": 0.02854125387966633, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.801637649536133, "eval_wikibio_n_ngrams_match_1": 13.921875, "eval_wikibio_n_ngrams_match_2": 4.90625, "eval_wikibio_n_ngrams_match_3": 2.140625, "eval_wikibio_num_pred_words": 50.3515625, "eval_wikibio_num_true_words": 51.8125, "eval_wikibio_perplexity": 16.471599402712258, "eval_wikibio_pred_num_tokens": 105.125, "eval_wikibio_rouge_score": 0.2736638150527463, "eval_wikibio_runtime": 30.4423, "eval_wikibio_samples_per_second": 16.425, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2949490124502468, "eval_wikibio_token_set_f1_sem": 0.014326654074106783, "eval_wikibio_token_set_precision": 0.26402473838016666, "eval_wikibio_token_set_recall": 0.38159079939080376, "eval_wikibio_true_num_tokens": 99.3046875, "step": 150000 }, { "epoch": 8.773469029654326, "eval_msmarco_accuracy": 0.373203125, "eval_msmarco_bleu_score": 11.490527819855732, "eval_msmarco_bleu_score_sem": 0.9292125668859211, "eval_msmarco_emb_cos_sim": 0.7165483236312866, "eval_msmarco_emb_cos_sim_sem": 0.01812005043029785, "eval_msmarco_emb_top1_equal": 0.8984375, "eval_msmarco_emb_top1_equal_sem": 0.026804566383361816, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.9380462169647217, "eval_msmarco_n_ngrams_match_1": 25.953125, "eval_msmarco_n_ngrams_match_2": 10.265625, "eval_msmarco_n_ngrams_match_3": 4.796875, "eval_msmarco_num_pred_words": 64.015625, "eval_msmarco_num_true_words": 63.3984375, "eval_msmarco_perplexity": 6.945168354107697, "eval_msmarco_pred_num_tokens": 89.171875, "eval_msmarco_rouge_score": 0.3845628930871683, "eval_msmarco_runtime": 25.2306, "eval_msmarco_samples_per_second": 19.817, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.41862928073289124, "eval_msmarco_token_set_f1_sem": 0.011368636797487966, "eval_msmarco_token_set_precision": 0.37450308919380254, "eval_msmarco_token_set_recall": 0.49290727532476425, "eval_msmarco_true_num_tokens": 82.2578125, "step": 150000 }, { "epoch": 8.779318009007428, "grad_norm": 0.2825964689254761, "learning_rate": 0.001, "loss": 2.0535, "step": 150100 }, { "epoch": 8.785166988360531, "grad_norm": 0.2506527304649353, "learning_rate": 0.001, "loss": 2.0513, "step": 150200 }, { "epoch": 8.791015967713633, "grad_norm": 0.25880351662635803, "learning_rate": 0.001, "loss": 2.0541, "step": 150300 }, { "epoch": 8.796864947066737, "grad_norm": 0.3221510350704193, "learning_rate": 0.001, "loss": 2.0558, "step": 150400 }, { "epoch": 8.80271392641984, "grad_norm": 0.28172996640205383, "learning_rate": 0.001, "loss": 2.0451, "step": 150500 }, { "epoch": 8.808562905772943, "grad_norm": 0.24907757341861725, "learning_rate": 0.001, "loss": 2.052, "step": 150600 }, { "epoch": 8.814411885126045, "grad_norm": 0.26681485772132874, "learning_rate": 0.001, "loss": 2.0537, "step": 150700 }, { "epoch": 8.820260864479149, "grad_norm": 0.20914389193058014, "learning_rate": 0.001, "loss": 2.0526, "step": 150800 }, { "epoch": 8.826109843832251, "grad_norm": 0.23438094556331635, "learning_rate": 0.001, "loss": 2.0458, "step": 150900 }, { "epoch": 8.831958823185355, "grad_norm": 0.34047195315361023, "learning_rate": 0.001, "loss": 2.0484, "step": 151000 }, { "epoch": 8.837807802538457, "grad_norm": 0.2627622187137604, "learning_rate": 0.001, "loss": 2.0496, "step": 151100 }, { "epoch": 8.84365678189156, "grad_norm": 0.3006042540073395, "learning_rate": 0.001, "loss": 2.0504, "step": 151200 }, { "epoch": 8.849505761244663, "grad_norm": 0.3391090929508209, "learning_rate": 0.001, "loss": 2.0537, "step": 151300 }, { "epoch": 8.855354740597766, "grad_norm": 0.2755794823169708, "learning_rate": 0.001, "loss": 2.0487, "step": 151400 }, { "epoch": 8.861203719950868, "grad_norm": 0.23251065611839294, "learning_rate": 0.001, "loss": 2.043, "step": 151500 }, { "epoch": 8.867052699303972, "grad_norm": 0.2344355434179306, "learning_rate": 0.001, "loss": 2.0512, "step": 151600 }, { "epoch": 8.872901678657074, "grad_norm": 0.3166691064834595, "learning_rate": 0.001, "loss": 2.0486, "step": 151700 }, { "epoch": 8.878750658010178, "grad_norm": 0.2187194526195526, "learning_rate": 0.001, "loss": 2.0493, "step": 151800 }, { "epoch": 8.88459963736328, "grad_norm": 0.3117513060569763, "learning_rate": 0.001, "loss": 2.043, "step": 151900 }, { "epoch": 8.890448616716384, "grad_norm": 0.24717245995998383, "learning_rate": 0.001, "loss": 2.0487, "step": 152000 }, { "epoch": 8.896297596069486, "grad_norm": 0.2561193108558655, "learning_rate": 0.001, "loss": 2.0416, "step": 152100 }, { "epoch": 8.902146575422588, "grad_norm": 0.326510488986969, "learning_rate": 0.001, "loss": 2.0596, "step": 152200 }, { "epoch": 8.907995554775692, "grad_norm": 0.2627631425857544, "learning_rate": 0.001, "loss": 2.05, "step": 152300 }, { "epoch": 8.913844534128794, "grad_norm": 0.25266319513320923, "learning_rate": 0.001, "loss": 2.0396, "step": 152400 }, { "epoch": 8.919693513481898, "grad_norm": 0.2523305416107178, "learning_rate": 0.001, "loss": 2.0509, "step": 152500 }, { "epoch": 8.925542492835, "grad_norm": 0.2340974658727646, "learning_rate": 0.001, "loss": 2.0458, "step": 152600 }, { "epoch": 8.931391472188103, "grad_norm": 0.2976679503917694, "learning_rate": 0.001, "loss": 2.0468, "step": 152700 }, { "epoch": 8.937240451541205, "grad_norm": 0.4015559256076813, "learning_rate": 0.001, "loss": 2.0482, "step": 152800 }, { "epoch": 8.94308943089431, "grad_norm": 0.2554238438606262, "learning_rate": 0.001, "loss": 2.0518, "step": 152900 }, { "epoch": 8.948938410247411, "grad_norm": 0.2403808832168579, "learning_rate": 0.001, "loss": 2.0513, "step": 153000 }, { "epoch": 8.954787389600515, "grad_norm": 0.28589460253715515, "learning_rate": 0.001, "loss": 2.0514, "step": 153100 }, { "epoch": 8.960636368953617, "grad_norm": 0.25179237127304077, "learning_rate": 0.001, "loss": 2.051, "step": 153200 }, { "epoch": 8.96648534830672, "grad_norm": 0.2831677794456482, "learning_rate": 0.001, "loss": 2.0464, "step": 153300 }, { "epoch": 8.972334327659823, "grad_norm": 0.2126781940460205, "learning_rate": 0.001, "loss": 2.0523, "step": 153400 }, { "epoch": 8.978183307012927, "grad_norm": 0.27988240122795105, "learning_rate": 0.001, "loss": 2.0489, "step": 153500 }, { "epoch": 8.984032286366029, "grad_norm": 0.3193823993206024, "learning_rate": 0.001, "loss": 2.0489, "step": 153600 }, { "epoch": 8.989881265719132, "grad_norm": 0.3461087644100189, "learning_rate": 0.001, "loss": 2.0544, "step": 153700 }, { "epoch": 8.995730245072235, "grad_norm": 0.3058323264122009, "learning_rate": 0.001, "loss": 2.0511, "step": 153800 }, { "epoch": 9.001579224425338, "grad_norm": 0.2959921658039093, "learning_rate": 0.001, "loss": 2.0448, "step": 153900 }, { "epoch": 9.00742820377844, "grad_norm": 0.24591955542564392, "learning_rate": 0.001, "loss": 2.0337, "step": 154000 }, { "epoch": 9.013277183131544, "grad_norm": 0.16780972480773926, "learning_rate": 0.001, "loss": 2.0301, "step": 154100 }, { "epoch": 9.019126162484646, "grad_norm": 0.22066181898117065, "learning_rate": 0.001, "loss": 2.0307, "step": 154200 }, { "epoch": 9.02497514183775, "grad_norm": 0.19532674551010132, "learning_rate": 0.001, "loss": 2.0315, "step": 154300 }, { "epoch": 9.030824121190852, "grad_norm": 0.3128432631492615, "learning_rate": 0.001, "loss": 2.0357, "step": 154400 }, { "epoch": 9.036673100543956, "grad_norm": 0.27528783679008484, "learning_rate": 0.001, "loss": 2.0308, "step": 154500 }, { "epoch": 9.042522079897058, "grad_norm": 0.23309800028800964, "learning_rate": 0.001, "loss": 2.0323, "step": 154600 }, { "epoch": 9.048371059250162, "grad_norm": 0.30240294337272644, "learning_rate": 0.001, "loss": 2.0357, "step": 154700 }, { "epoch": 9.054220038603264, "grad_norm": 0.24562114477157593, "learning_rate": 0.001, "loss": 2.0339, "step": 154800 }, { "epoch": 9.060069017956367, "grad_norm": 0.2223876565694809, "learning_rate": 0.001, "loss": 2.0278, "step": 154900 }, { "epoch": 9.06591799730947, "grad_norm": 0.2961469888687134, "learning_rate": 0.001, "loss": 2.0323, "step": 155000 }, { "epoch": 9.071766976662573, "grad_norm": 0.29214558005332947, "learning_rate": 0.001, "loss": 2.0319, "step": 155100 }, { "epoch": 9.077615956015675, "grad_norm": 0.1990915685892105, "learning_rate": 0.001, "loss": 2.0342, "step": 155200 }, { "epoch": 9.083464935368777, "grad_norm": 0.28852370381355286, "learning_rate": 0.001, "loss": 2.0331, "step": 155300 }, { "epoch": 9.089313914721881, "grad_norm": 0.29607099294662476, "learning_rate": 0.001, "loss": 2.0379, "step": 155400 }, { "epoch": 9.095162894074983, "grad_norm": 0.2689138948917389, "learning_rate": 0.001, "loss": 2.0332, "step": 155500 }, { "epoch": 9.101011873428087, "grad_norm": 0.26251867413520813, "learning_rate": 0.001, "loss": 2.0334, "step": 155600 }, { "epoch": 9.106860852781189, "grad_norm": 0.23764045536518097, "learning_rate": 0.001, "loss": 2.0332, "step": 155700 }, { "epoch": 9.112709832134293, "grad_norm": 0.2855386435985565, "learning_rate": 0.001, "loss": 2.0259, "step": 155800 }, { "epoch": 9.118558811487395, "grad_norm": 0.2567245662212372, "learning_rate": 0.001, "loss": 2.0381, "step": 155900 }, { "epoch": 9.124407790840499, "grad_norm": 0.3116306662559509, "learning_rate": 0.001, "loss": 2.0322, "step": 156000 }, { "epoch": 9.1302567701936, "grad_norm": 0.2775784730911255, "learning_rate": 0.001, "loss": 2.0279, "step": 156100 }, { "epoch": 9.136105749546704, "grad_norm": 0.18819324672222137, "learning_rate": 0.001, "loss": 2.0284, "step": 156200 }, { "epoch": 9.141954728899806, "grad_norm": 0.2461482584476471, "learning_rate": 0.001, "loss": 2.0327, "step": 156300 }, { "epoch": 9.14780370825291, "grad_norm": 0.32233667373657227, "learning_rate": 0.001, "loss": 2.0345, "step": 156400 }, { "epoch": 9.153652687606012, "grad_norm": 0.25237587094306946, "learning_rate": 0.001, "loss": 2.0403, "step": 156500 }, { "epoch": 9.159501666959116, "grad_norm": 0.2369132786989212, "learning_rate": 0.001, "loss": 2.0355, "step": 156600 }, { "epoch": 9.165350646312218, "grad_norm": 0.24383118748664856, "learning_rate": 0.001, "loss": 2.033, "step": 156700 }, { "epoch": 9.171199625665322, "grad_norm": 0.3187243640422821, "learning_rate": 0.001, "loss": 2.0399, "step": 156800 }, { "epoch": 9.177048605018424, "grad_norm": 0.22270141541957855, "learning_rate": 0.001, "loss": 2.038, "step": 156900 }, { "epoch": 9.182897584371528, "grad_norm": 0.18884705007076263, "learning_rate": 0.001, "loss": 2.0333, "step": 157000 }, { "epoch": 9.18874656372463, "grad_norm": 0.18620070815086365, "learning_rate": 0.001, "loss": 2.028, "step": 157100 }, { "epoch": 9.194595543077734, "grad_norm": 0.2736113667488098, "learning_rate": 0.001, "loss": 2.0352, "step": 157200 }, { "epoch": 9.200444522430836, "grad_norm": 0.19833335280418396, "learning_rate": 0.001, "loss": 2.038, "step": 157300 }, { "epoch": 9.20629350178394, "grad_norm": 0.3165561556816101, "learning_rate": 0.001, "loss": 2.0396, "step": 157400 }, { "epoch": 9.212142481137041, "grad_norm": 0.3856927454471588, "learning_rate": 0.001, "loss": 2.0368, "step": 157500 }, { "epoch": 9.217991460490145, "grad_norm": 0.24716055393218994, "learning_rate": 0.001, "loss": 2.0402, "step": 157600 }, { "epoch": 9.223840439843247, "grad_norm": 0.24186432361602783, "learning_rate": 0.001, "loss": 2.0415, "step": 157700 }, { "epoch": 9.229689419196351, "grad_norm": 0.27013206481933594, "learning_rate": 0.001, "loss": 2.0388, "step": 157800 }, { "epoch": 9.235538398549453, "grad_norm": 0.26048728823661804, "learning_rate": 0.001, "loss": 2.0363, "step": 157900 }, { "epoch": 9.241387377902557, "grad_norm": 0.29677748680114746, "learning_rate": 0.001, "loss": 2.0332, "step": 158000 }, { "epoch": 9.247236357255659, "grad_norm": 0.2656020224094391, "learning_rate": 0.001, "loss": 2.0357, "step": 158100 }, { "epoch": 9.253085336608763, "grad_norm": 0.30814477801322937, "learning_rate": 0.001, "loss": 2.0352, "step": 158200 }, { "epoch": 9.258934315961865, "grad_norm": 0.258783757686615, "learning_rate": 0.001, "loss": 2.0352, "step": 158300 }, { "epoch": 9.264783295314967, "grad_norm": 0.30140233039855957, "learning_rate": 0.001, "loss": 2.0294, "step": 158400 }, { "epoch": 9.27063227466807, "grad_norm": 0.23723970353603363, "learning_rate": 0.001, "loss": 2.0307, "step": 158500 }, { "epoch": 9.276481254021173, "grad_norm": 0.31492337584495544, "learning_rate": 0.001, "loss": 2.0366, "step": 158600 }, { "epoch": 9.282330233374276, "grad_norm": 0.28039777278900146, "learning_rate": 0.001, "loss": 2.0395, "step": 158700 }, { "epoch": 9.288179212727378, "grad_norm": 0.24066326022148132, "learning_rate": 0.001, "loss": 2.0404, "step": 158800 }, { "epoch": 9.294028192080482, "grad_norm": 0.237651064991951, "learning_rate": 0.001, "loss": 2.0371, "step": 158900 }, { "epoch": 9.299877171433584, "grad_norm": 0.21849972009658813, "learning_rate": 0.001, "loss": 2.0377, "step": 159000 }, { "epoch": 9.305726150786688, "grad_norm": 0.22698575258255005, "learning_rate": 0.001, "loss": 2.03, "step": 159100 }, { "epoch": 9.31157513013979, "grad_norm": 0.25280654430389404, "learning_rate": 0.001, "loss": 2.0324, "step": 159200 }, { "epoch": 9.317424109492894, "grad_norm": 0.2667643427848816, "learning_rate": 0.001, "loss": 2.0298, "step": 159300 }, { "epoch": 9.323273088845996, "grad_norm": 0.2121478021144867, "learning_rate": 0.001, "loss": 2.0353, "step": 159400 }, { "epoch": 9.3291220681991, "grad_norm": 0.22061362862586975, "learning_rate": 0.001, "loss": 2.0371, "step": 159500 }, { "epoch": 9.334971047552202, "grad_norm": 0.23518681526184082, "learning_rate": 0.001, "loss": 2.0298, "step": 159600 }, { "epoch": 9.340820026905305, "grad_norm": 0.30270805954933167, "learning_rate": 0.001, "loss": 2.0402, "step": 159700 }, { "epoch": 9.346669006258407, "grad_norm": 0.30616897344589233, "learning_rate": 0.001, "loss": 2.033, "step": 159800 }, { "epoch": 9.352517985611511, "grad_norm": 0.23180386424064636, "learning_rate": 0.001, "loss": 2.0364, "step": 159900 }, { "epoch": 9.358366964964613, "grad_norm": 0.18928752839565277, "learning_rate": 0.001, "loss": 2.0306, "step": 160000 }, { "epoch": 9.364215944317717, "grad_norm": 0.2677852213382721, "learning_rate": 0.001, "loss": 2.0331, "step": 160100 }, { "epoch": 9.37006492367082, "grad_norm": 0.27995216846466064, "learning_rate": 0.001, "loss": 2.0354, "step": 160200 }, { "epoch": 9.375913903023923, "grad_norm": 0.2641223073005676, "learning_rate": 0.001, "loss": 2.0316, "step": 160300 }, { "epoch": 9.381762882377025, "grad_norm": 0.26066839694976807, "learning_rate": 0.001, "loss": 2.0316, "step": 160400 }, { "epoch": 9.387611861730129, "grad_norm": 0.37251099944114685, "learning_rate": 0.001, "loss": 2.0362, "step": 160500 }, { "epoch": 9.39346084108323, "grad_norm": 0.22420814633369446, "learning_rate": 0.001, "loss": 2.0349, "step": 160600 }, { "epoch": 9.399309820436335, "grad_norm": 0.2669471204280853, "learning_rate": 0.001, "loss": 2.038, "step": 160700 }, { "epoch": 9.405158799789437, "grad_norm": 0.25145766139030457, "learning_rate": 0.001, "loss": 2.0306, "step": 160800 }, { "epoch": 9.41100777914254, "grad_norm": 0.28328484296798706, "learning_rate": 0.001, "loss": 2.0373, "step": 160900 }, { "epoch": 9.416856758495642, "grad_norm": 0.19915758073329926, "learning_rate": 0.001, "loss": 2.0338, "step": 161000 }, { "epoch": 9.422705737848746, "grad_norm": 0.30612897872924805, "learning_rate": 0.001, "loss": 2.0389, "step": 161100 }, { "epoch": 9.428554717201848, "grad_norm": 0.2006330043077469, "learning_rate": 0.001, "loss": 2.0396, "step": 161200 }, { "epoch": 9.434403696554952, "grad_norm": 0.24797473847866058, "learning_rate": 0.001, "loss": 2.0371, "step": 161300 }, { "epoch": 9.440252675908054, "grad_norm": 0.2486206442117691, "learning_rate": 0.001, "loss": 2.0341, "step": 161400 }, { "epoch": 9.446101655261156, "grad_norm": 0.16843076050281525, "learning_rate": 0.001, "loss": 2.0303, "step": 161500 }, { "epoch": 9.45195063461426, "grad_norm": 0.42661377787590027, "learning_rate": 0.001, "loss": 2.0357, "step": 161600 }, { "epoch": 9.457799613967362, "grad_norm": 0.21401609480381012, "learning_rate": 0.001, "loss": 2.0347, "step": 161700 }, { "epoch": 9.463648593320466, "grad_norm": 0.2553645670413971, "learning_rate": 0.001, "loss": 2.0382, "step": 161800 }, { "epoch": 9.469497572673568, "grad_norm": 0.16838331520557404, "learning_rate": 0.001, "loss": 2.031, "step": 161900 }, { "epoch": 9.475346552026672, "grad_norm": 0.24693629145622253, "learning_rate": 0.001, "loss": 2.0326, "step": 162000 }, { "epoch": 9.481195531379774, "grad_norm": 0.2836306691169739, "learning_rate": 0.001, "loss": 2.0316, "step": 162100 }, { "epoch": 9.487044510732877, "grad_norm": 0.2127094566822052, "learning_rate": 0.001, "loss": 2.036, "step": 162200 }, { "epoch": 9.49289349008598, "grad_norm": 0.20564542710781097, "learning_rate": 0.001, "loss": 2.0258, "step": 162300 }, { "epoch": 9.498742469439083, "grad_norm": 0.26087310910224915, "learning_rate": 0.001, "loss": 2.035, "step": 162400 }, { "epoch": 9.504591448792185, "grad_norm": 0.2260916829109192, "learning_rate": 0.001, "loss": 2.0344, "step": 162500 }, { "epoch": 9.510440428145289, "grad_norm": 0.184107705950737, "learning_rate": 0.001, "loss": 2.0364, "step": 162600 }, { "epoch": 9.516289407498391, "grad_norm": 0.2765580117702484, "learning_rate": 0.001, "loss": 2.0333, "step": 162700 }, { "epoch": 9.522138386851495, "grad_norm": 0.2933179438114166, "learning_rate": 0.001, "loss": 2.0374, "step": 162800 }, { "epoch": 9.527987366204597, "grad_norm": 0.22573572397232056, "learning_rate": 0.001, "loss": 2.0349, "step": 162900 }, { "epoch": 9.5338363455577, "grad_norm": 0.22123439610004425, "learning_rate": 0.001, "loss": 2.0333, "step": 163000 }, { "epoch": 9.539685324910803, "grad_norm": 0.31800583004951477, "learning_rate": 0.001, "loss": 2.0348, "step": 163100 }, { "epoch": 9.545534304263906, "grad_norm": 0.3488694131374359, "learning_rate": 0.001, "loss": 2.0306, "step": 163200 }, { "epoch": 9.551383283617009, "grad_norm": 0.3410300612449646, "learning_rate": 0.001, "loss": 2.034, "step": 163300 }, { "epoch": 9.557232262970112, "grad_norm": 0.2346968799829483, "learning_rate": 0.001, "loss": 2.031, "step": 163400 }, { "epoch": 9.563081242323214, "grad_norm": 0.29236361384391785, "learning_rate": 0.001, "loss": 2.036, "step": 163500 }, { "epoch": 9.568930221676318, "grad_norm": 0.27073583006858826, "learning_rate": 0.001, "loss": 2.0399, "step": 163600 }, { "epoch": 9.57477920102942, "grad_norm": 0.20399492979049683, "learning_rate": 0.001, "loss": 2.0366, "step": 163700 }, { "epoch": 9.580628180382524, "grad_norm": 0.18849824368953705, "learning_rate": 0.001, "loss": 2.0372, "step": 163800 }, { "epoch": 9.586477159735626, "grad_norm": 0.237285315990448, "learning_rate": 0.001, "loss": 2.0317, "step": 163900 }, { "epoch": 9.59232613908873, "grad_norm": 0.2073872685432434, "learning_rate": 0.001, "loss": 2.0349, "step": 164000 }, { "epoch": 9.598175118441832, "grad_norm": 0.23518729209899902, "learning_rate": 0.001, "loss": 2.0325, "step": 164100 }, { "epoch": 9.604024097794936, "grad_norm": 0.280071496963501, "learning_rate": 0.001, "loss": 2.0359, "step": 164200 }, { "epoch": 9.609873077148038, "grad_norm": 0.21178285777568817, "learning_rate": 0.001, "loss": 2.0313, "step": 164300 }, { "epoch": 9.615722056501141, "grad_norm": 0.29683855175971985, "learning_rate": 0.001, "loss": 2.0318, "step": 164400 }, { "epoch": 9.621571035854243, "grad_norm": 0.18460041284561157, "learning_rate": 0.001, "loss": 2.0321, "step": 164500 }, { "epoch": 9.627420015207345, "grad_norm": 0.17134425044059753, "learning_rate": 0.001, "loss": 2.034, "step": 164600 }, { "epoch": 9.63326899456045, "grad_norm": 0.23944851756095886, "learning_rate": 0.001, "loss": 2.0347, "step": 164700 }, { "epoch": 9.639117973913551, "grad_norm": 0.1878364086151123, "learning_rate": 0.001, "loss": 2.0347, "step": 164800 }, { "epoch": 9.644966953266655, "grad_norm": 0.2547733187675476, "learning_rate": 0.001, "loss": 2.0371, "step": 164900 }, { "epoch": 9.650815932619757, "grad_norm": 0.367452472448349, "learning_rate": 0.001, "loss": 2.0379, "step": 165000 }, { "epoch": 9.656664911972861, "grad_norm": 0.24134086072444916, "learning_rate": 0.001, "loss": 2.036, "step": 165100 }, { "epoch": 9.662513891325963, "grad_norm": 0.31158921122550964, "learning_rate": 0.001, "loss": 2.0361, "step": 165200 }, { "epoch": 9.668362870679067, "grad_norm": 0.23723387718200684, "learning_rate": 0.001, "loss": 2.0319, "step": 165300 }, { "epoch": 9.674211850032169, "grad_norm": 0.21048542857170105, "learning_rate": 0.001, "loss": 2.0386, "step": 165400 }, { "epoch": 9.680060829385273, "grad_norm": 0.2461100071668625, "learning_rate": 0.001, "loss": 2.0337, "step": 165500 }, { "epoch": 9.685909808738375, "grad_norm": 0.3699190616607666, "learning_rate": 0.001, "loss": 2.0314, "step": 165600 }, { "epoch": 9.691758788091478, "grad_norm": 0.25351476669311523, "learning_rate": 0.001, "loss": 2.0288, "step": 165700 }, { "epoch": 9.69760776744458, "grad_norm": 0.19149437546730042, "learning_rate": 0.001, "loss": 2.0287, "step": 165800 }, { "epoch": 9.703456746797684, "grad_norm": 0.2228778600692749, "learning_rate": 0.001, "loss": 2.0289, "step": 165900 }, { "epoch": 9.709305726150786, "grad_norm": 0.17589032649993896, "learning_rate": 0.001, "loss": 2.0327, "step": 166000 }, { "epoch": 9.71515470550389, "grad_norm": 0.27442651987075806, "learning_rate": 0.001, "loss": 2.0347, "step": 166100 }, { "epoch": 9.721003684856992, "grad_norm": 0.25203439593315125, "learning_rate": 0.001, "loss": 2.0368, "step": 166200 }, { "epoch": 9.726852664210096, "grad_norm": 0.35635894536972046, "learning_rate": 0.001, "loss": 2.0289, "step": 166300 }, { "epoch": 9.732701643563198, "grad_norm": 0.21681372821331024, "learning_rate": 0.001, "loss": 2.0338, "step": 166400 }, { "epoch": 9.738550622916302, "grad_norm": 0.2901915907859802, "learning_rate": 0.001, "loss": 2.0338, "step": 166500 }, { "epoch": 9.744399602269404, "grad_norm": 0.20905114710330963, "learning_rate": 0.001, "loss": 2.0313, "step": 166600 }, { "epoch": 9.750248581622508, "grad_norm": 0.22842802107334137, "learning_rate": 0.001, "loss": 2.0319, "step": 166700 }, { "epoch": 9.75609756097561, "grad_norm": 0.21534320712089539, "learning_rate": 0.001, "loss": 2.0349, "step": 166800 }, { "epoch": 9.761946540328713, "grad_norm": 0.2837488055229187, "learning_rate": 0.001, "loss": 2.0383, "step": 166900 }, { "epoch": 9.767795519681815, "grad_norm": 0.1845802366733551, "learning_rate": 0.001, "loss": 2.0292, "step": 167000 }, { "epoch": 9.77364449903492, "grad_norm": 0.2700027823448181, "learning_rate": 0.001, "loss": 2.0309, "step": 167100 }, { "epoch": 9.779493478388021, "grad_norm": 0.2118852436542511, "learning_rate": 0.001, "loss": 2.035, "step": 167200 }, { "epoch": 9.785342457741125, "grad_norm": 0.2572864294052124, "learning_rate": 0.001, "loss": 2.0306, "step": 167300 }, { "epoch": 9.791191437094227, "grad_norm": 0.2283550202846527, "learning_rate": 0.001, "loss": 2.0345, "step": 167400 }, { "epoch": 9.79704041644733, "grad_norm": 0.31894946098327637, "learning_rate": 0.001, "loss": 2.0372, "step": 167500 }, { "epoch": 9.802889395800433, "grad_norm": 0.25806519389152527, "learning_rate": 0.001, "loss": 2.0379, "step": 167600 }, { "epoch": 9.808738375153535, "grad_norm": 0.22051940858364105, "learning_rate": 0.001, "loss": 2.0328, "step": 167700 }, { "epoch": 9.814587354506639, "grad_norm": 0.27118876576423645, "learning_rate": 0.001, "loss": 2.0329, "step": 167800 }, { "epoch": 9.82043633385974, "grad_norm": 0.19263893365859985, "learning_rate": 0.001, "loss": 2.0311, "step": 167900 }, { "epoch": 9.826285313212844, "grad_norm": 0.23989814519882202, "learning_rate": 0.001, "loss": 2.0271, "step": 168000 }, { "epoch": 9.832134292565947, "grad_norm": 0.22814209759235382, "learning_rate": 0.001, "loss": 2.0368, "step": 168100 }, { "epoch": 9.83798327191905, "grad_norm": 0.25933459401130676, "learning_rate": 0.001, "loss": 2.0344, "step": 168200 }, { "epoch": 9.843832251272152, "grad_norm": 0.20532174408435822, "learning_rate": 0.001, "loss": 2.0302, "step": 168300 }, { "epoch": 9.849681230625256, "grad_norm": 0.25750061869621277, "learning_rate": 0.001, "loss": 2.0347, "step": 168400 }, { "epoch": 9.855530209978358, "grad_norm": 0.22548291087150574, "learning_rate": 0.001, "loss": 2.0345, "step": 168500 }, { "epoch": 9.861379189331462, "grad_norm": 0.21954746544361115, "learning_rate": 0.001, "loss": 2.0343, "step": 168600 }, { "epoch": 9.867228168684564, "grad_norm": 0.3634682595729828, "learning_rate": 0.001, "loss": 2.0306, "step": 168700 }, { "epoch": 9.873077148037668, "grad_norm": 0.24717816710472107, "learning_rate": 0.001, "loss": 2.0294, "step": 168800 }, { "epoch": 9.87892612739077, "grad_norm": 0.1872510313987732, "learning_rate": 0.001, "loss": 2.0324, "step": 168900 }, { "epoch": 9.884775106743874, "grad_norm": 0.20392240583896637, "learning_rate": 0.001, "loss": 2.0296, "step": 169000 }, { "epoch": 9.890624086096976, "grad_norm": 0.2665574252605438, "learning_rate": 0.001, "loss": 2.028, "step": 169100 }, { "epoch": 9.89647306545008, "grad_norm": 0.17171132564544678, "learning_rate": 0.001, "loss": 2.0333, "step": 169200 }, { "epoch": 9.902322044803181, "grad_norm": 0.19852541387081146, "learning_rate": 0.001, "loss": 2.0353, "step": 169300 }, { "epoch": 9.908171024156285, "grad_norm": 0.23423729836940765, "learning_rate": 0.001, "loss": 2.0324, "step": 169400 }, { "epoch": 9.914020003509387, "grad_norm": 0.2906620502471924, "learning_rate": 0.001, "loss": 2.0303, "step": 169500 }, { "epoch": 9.919868982862491, "grad_norm": 0.19138191640377045, "learning_rate": 0.001, "loss": 2.031, "step": 169600 }, { "epoch": 9.925717962215593, "grad_norm": 0.2571549117565155, "learning_rate": 0.001, "loss": 2.0345, "step": 169700 }, { "epoch": 9.931566941568697, "grad_norm": 0.16393856704235077, "learning_rate": 0.001, "loss": 2.0369, "step": 169800 }, { "epoch": 9.937415920921799, "grad_norm": 0.2369818389415741, "learning_rate": 0.001, "loss": 2.0355, "step": 169900 }, { "epoch": 9.943264900274903, "grad_norm": 0.2003248631954193, "learning_rate": 0.001, "loss": 2.0284, "step": 170000 }, { "epoch": 9.949113879628005, "grad_norm": 0.22155320644378662, "learning_rate": 0.001, "loss": 2.0298, "step": 170100 }, { "epoch": 9.954962858981109, "grad_norm": 0.31322985887527466, "learning_rate": 0.001, "loss": 2.0296, "step": 170200 }, { "epoch": 9.96081183833421, "grad_norm": 0.2524648606777191, "learning_rate": 0.001, "loss": 2.0329, "step": 170300 }, { "epoch": 9.966660817687314, "grad_norm": 0.2375606745481491, "learning_rate": 0.001, "loss": 2.0253, "step": 170400 }, { "epoch": 9.972509797040416, "grad_norm": 0.23473088443279266, "learning_rate": 0.001, "loss": 2.0268, "step": 170500 }, { "epoch": 9.97835877639352, "grad_norm": 0.3087780177593231, "learning_rate": 0.001, "loss": 2.0326, "step": 170600 }, { "epoch": 9.984207755746622, "grad_norm": 0.23942995071411133, "learning_rate": 0.001, "loss": 2.0303, "step": 170700 }, { "epoch": 9.990056735099724, "grad_norm": 0.27665162086486816, "learning_rate": 0.001, "loss": 2.0305, "step": 170800 }, { "epoch": 9.995905714452828, "grad_norm": 0.23798005282878876, "learning_rate": 0.001, "loss": 2.0305, "step": 170900 }, { "epoch": 10.00175469380593, "grad_norm": 0.23685629665851593, "learning_rate": 0.001, "loss": 2.031, "step": 171000 }, { "epoch": 10.007603673159034, "grad_norm": 0.3327007293701172, "learning_rate": 0.001, "loss": 2.0149, "step": 171100 }, { "epoch": 10.013452652512136, "grad_norm": 0.30209827423095703, "learning_rate": 0.001, "loss": 2.0202, "step": 171200 }, { "epoch": 10.01930163186524, "grad_norm": 0.21061380207538605, "learning_rate": 0.001, "loss": 2.0174, "step": 171300 }, { "epoch": 10.025150611218342, "grad_norm": 0.22616459429264069, "learning_rate": 0.001, "loss": 2.015, "step": 171400 }, { "epoch": 10.030999590571446, "grad_norm": 0.27842339873313904, "learning_rate": 0.001, "loss": 2.0207, "step": 171500 }, { "epoch": 10.036848569924548, "grad_norm": 0.23795592784881592, "learning_rate": 0.001, "loss": 2.0118, "step": 171600 }, { "epoch": 10.042697549277651, "grad_norm": 0.3537725806236267, "learning_rate": 0.001, "loss": 2.0134, "step": 171700 }, { "epoch": 10.048546528630753, "grad_norm": 0.31624871492385864, "learning_rate": 0.001, "loss": 2.0175, "step": 171800 }, { "epoch": 10.054395507983857, "grad_norm": 0.2975766658782959, "learning_rate": 0.001, "loss": 2.0101, "step": 171900 }, { "epoch": 10.06024448733696, "grad_norm": 0.34898146986961365, "learning_rate": 0.001, "loss": 2.0148, "step": 172000 }, { "epoch": 10.066093466690063, "grad_norm": 0.2460363805294037, "learning_rate": 0.001, "loss": 2.0222, "step": 172100 }, { "epoch": 10.071942446043165, "grad_norm": 0.18772847950458527, "learning_rate": 0.001, "loss": 2.0133, "step": 172200 }, { "epoch": 10.077791425396269, "grad_norm": 0.2119777947664261, "learning_rate": 0.001, "loss": 2.0229, "step": 172300 }, { "epoch": 10.08364040474937, "grad_norm": 0.21838730573654175, "learning_rate": 0.001, "loss": 2.0135, "step": 172400 }, { "epoch": 10.089489384102475, "grad_norm": 0.3392496705055237, "learning_rate": 0.001, "loss": 2.0201, "step": 172500 }, { "epoch": 10.095338363455577, "grad_norm": 0.2751774191856384, "learning_rate": 0.001, "loss": 2.0218, "step": 172600 }, { "epoch": 10.10118734280868, "grad_norm": 0.2273690551519394, "learning_rate": 0.001, "loss": 2.0143, "step": 172700 }, { "epoch": 10.107036322161782, "grad_norm": 0.1720428764820099, "learning_rate": 0.001, "loss": 2.0112, "step": 172800 }, { "epoch": 10.112885301514886, "grad_norm": 0.35446813702583313, "learning_rate": 0.001, "loss": 2.015, "step": 172900 }, { "epoch": 10.118734280867988, "grad_norm": 0.2301865518093109, "learning_rate": 0.001, "loss": 2.0179, "step": 173000 }, { "epoch": 10.124583260221092, "grad_norm": 0.25079113245010376, "learning_rate": 0.001, "loss": 2.016, "step": 173100 }, { "epoch": 10.130432239574194, "grad_norm": 0.32001379132270813, "learning_rate": 0.001, "loss": 2.0186, "step": 173200 }, { "epoch": 10.136281218927298, "grad_norm": 0.24986614286899567, "learning_rate": 0.001, "loss": 2.0236, "step": 173300 }, { "epoch": 10.1421301982804, "grad_norm": 0.28886914253234863, "learning_rate": 0.001, "loss": 2.0131, "step": 173400 }, { "epoch": 10.147979177633504, "grad_norm": 0.22892028093338013, "learning_rate": 0.001, "loss": 2.011, "step": 173500 }, { "epoch": 10.153828156986606, "grad_norm": 0.2188529372215271, "learning_rate": 0.001, "loss": 2.0214, "step": 173600 }, { "epoch": 10.15967713633971, "grad_norm": 0.27691227197647095, "learning_rate": 0.001, "loss": 2.0151, "step": 173700 }, { "epoch": 10.165526115692812, "grad_norm": 0.197740837931633, "learning_rate": 0.001, "loss": 2.0149, "step": 173800 }, { "epoch": 10.171375095045914, "grad_norm": 0.2507222592830658, "learning_rate": 0.001, "loss": 2.0143, "step": 173900 }, { "epoch": 10.177224074399017, "grad_norm": 0.19524338841438293, "learning_rate": 0.001, "loss": 2.0165, "step": 174000 }, { "epoch": 10.18307305375212, "grad_norm": 0.30183881521224976, "learning_rate": 0.001, "loss": 2.0204, "step": 174100 }, { "epoch": 10.188922033105223, "grad_norm": 0.22435249388217926, "learning_rate": 0.001, "loss": 2.0204, "step": 174200 }, { "epoch": 10.194771012458325, "grad_norm": 0.27792537212371826, "learning_rate": 0.001, "loss": 2.0224, "step": 174300 }, { "epoch": 10.200619991811429, "grad_norm": 0.22015632688999176, "learning_rate": 0.001, "loss": 2.0236, "step": 174400 }, { "epoch": 10.206468971164531, "grad_norm": 0.28682488203048706, "learning_rate": 0.001, "loss": 2.0177, "step": 174500 }, { "epoch": 10.212317950517635, "grad_norm": 0.22568200528621674, "learning_rate": 0.001, "loss": 2.0192, "step": 174600 }, { "epoch": 10.218166929870737, "grad_norm": 0.24316483736038208, "learning_rate": 0.001, "loss": 2.0141, "step": 174700 }, { "epoch": 10.22401590922384, "grad_norm": 0.22606278955936432, "learning_rate": 0.001, "loss": 2.0187, "step": 174800 }, { "epoch": 10.229864888576943, "grad_norm": 0.24719713628292084, "learning_rate": 0.001, "loss": 2.0193, "step": 174900 }, { "epoch": 10.235713867930047, "grad_norm": 0.256766140460968, "learning_rate": 0.001, "loss": 2.0215, "step": 175000 }, { "epoch": 10.235713867930047, "eval_ag_news_accuracy": 0.2345, "eval_ag_news_bleu_score": 6.151556613251694, "eval_ag_news_bleu_score_sem": 0.40609259520791036, "eval_ag_news_emb_cos_sim": 0.6796672940254211, "eval_ag_news_emb_cos_sim_sem": 0.01484731025993824, "eval_ag_news_emb_top1_equal": 0.9375, "eval_ag_news_emb_top1_equal_sem": 0.02147948183119297, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.8272666931152344, "eval_ag_news_n_ngrams_match_1": 11.84375, "eval_ag_news_n_ngrams_match_2": 3.2734375, "eval_ag_news_n_ngrams_match_3": 1.203125, "eval_ag_news_num_pred_words": 43.3203125, "eval_ag_news_num_true_words": 43.75, "eval_ag_news_perplexity": 16.89920692165615, "eval_ag_news_pred_num_tokens": 68.5625, "eval_ag_news_rouge_score": 0.25453888087530196, "eval_ag_news_runtime": 37.756, "eval_ag_news_samples_per_second": 13.243, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.29483316764207207, "eval_ag_news_token_set_f1_sem": 0.00956013219386986, "eval_ag_news_token_set_precision": 0.2685420426323291, "eval_ag_news_token_set_recall": 0.3431715687629611, "eval_ag_news_true_num_tokens": 60.6875, "step": 175000 }, { "epoch": 10.235713867930047, "eval_anthropic_toxic_prompts_accuracy": 0.099609375, "eval_anthropic_toxic_prompts_bleu_score": 35.18366133792688, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.5044551430416493, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8588425517082214, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011440555565059185, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.984375, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.011004959233105183, "eval_anthropic_toxic_prompts_exact_match": 0.1015625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02680456515850638, "eval_anthropic_toxic_prompts_loss": 1.352903962135315, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.296875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.28125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.2109375, "eval_anthropic_toxic_prompts_num_pred_words": 15.7890625, "eval_anthropic_toxic_prompts_num_true_words": 16.65625, "eval_anthropic_toxic_prompts_perplexity": 3.8686436289606956, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.8203125, "eval_anthropic_toxic_prompts_rouge_score": 0.6272829910319686, "eval_anthropic_toxic_prompts_runtime": 29.3979, "eval_anthropic_toxic_prompts_samples_per_second": 17.008, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6460945473258557, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.019592948564281967, "eval_anthropic_toxic_prompts_token_set_precision": 0.6289148644509387, "eval_anthropic_toxic_prompts_token_set_recall": 0.6711435044078838, "eval_anthropic_toxic_prompts_true_num_tokens": 20.015625, "step": 175000 }, { "epoch": 10.235713867930047, "eval_arxiv_accuracy": 0.37265625, "eval_arxiv_bleu_score": 1.2534707031551722, "eval_arxiv_bleu_score_sem": 0.12130828638556414, "eval_arxiv_emb_cos_sim": 0.42049843072891235, "eval_arxiv_emb_cos_sim_sem": 0.01911146566271782, "eval_arxiv_emb_top1_equal": 0.859375, "eval_arxiv_emb_top1_equal_sem": 0.03084755875170231, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4756202697753906, "eval_arxiv_n_ngrams_match_1": 10.2421875, "eval_arxiv_n_ngrams_match_2": 1.6328125, "eval_arxiv_n_ngrams_match_3": 0.3203125, "eval_arxiv_num_pred_words": 49.1328125, "eval_arxiv_num_true_words": 85.0859375, "eval_arxiv_perplexity": 32.31786811779345, "eval_arxiv_pred_num_tokens": 126.1015625, "eval_arxiv_rouge_score": 0.13527679557541467, "eval_arxiv_runtime": 30.8607, "eval_arxiv_samples_per_second": 16.202, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.1418901455119252, "eval_arxiv_token_set_f1_sem": 0.009075817233861232, "eval_arxiv_token_set_precision": 0.09289058565467673, "eval_arxiv_token_set_recall": 0.382085808142846, "eval_arxiv_true_num_tokens": 125.3984375, "step": 175000 }, { "epoch": 10.235713867930047, "eval_python_code_alpaca_accuracy": 0.126046875, "eval_python_code_alpaca_bleu_score": 26.272817581766265, "eval_python_code_alpaca_bleu_score_sem": 1.5982302252565825, "eval_python_code_alpaca_emb_cos_sim": 0.8593010306358337, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009095671586692333, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.6146284341812134, "eval_python_code_alpaca_n_ngrams_match_1": 10.0859375, "eval_python_code_alpaca_n_ngrams_match_2": 5.3125, "eval_python_code_alpaca_n_ngrams_match_3": 3.0, "eval_python_code_alpaca_num_pred_words": 17.2109375, "eval_python_code_alpaca_num_true_words": 18.5546875, "eval_python_code_alpaca_perplexity": 5.026020079210729, "eval_python_code_alpaca_pred_num_tokens": 23.6015625, "eval_python_code_alpaca_rouge_score": 0.5741623082547593, "eval_python_code_alpaca_runtime": 29.6447, "eval_python_code_alpaca_samples_per_second": 16.866, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6021859514676878, "eval_python_code_alpaca_token_set_f1_sem": 0.013174085820174939, "eval_python_code_alpaca_token_set_precision": 0.5777304900227467, "eval_python_code_alpaca_token_set_recall": 0.6352020913904927, "eval_python_code_alpaca_true_num_tokens": 23.9921875, "step": 175000 }, { "epoch": 10.235713867930047, "eval_wikibio_accuracy": 0.353109375, "eval_wikibio_bleu_score": 6.377215765363509, "eval_wikibio_bleu_score_sem": 0.6998462440852667, "eval_wikibio_emb_cos_sim": 0.5356417894363403, "eval_wikibio_emb_cos_sim_sem": 0.023860972374677658, "eval_wikibio_emb_top1_equal": 0.859375, "eval_wikibio_emb_top1_equal_sem": 0.03084755875170231, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.780834436416626, "eval_wikibio_n_ngrams_match_1": 12.4765625, "eval_wikibio_n_ngrams_match_2": 4.1171875, "eval_wikibio_n_ngrams_match_3": 1.6328125, "eval_wikibio_num_pred_words": 47.6015625, "eval_wikibio_num_true_words": 52.6015625, "eval_wikibio_perplexity": 16.132476859377398, "eval_wikibio_pred_num_tokens": 104.453125, "eval_wikibio_rouge_score": 0.24949201822889758, "eval_wikibio_runtime": 31.6476, "eval_wikibio_samples_per_second": 15.799, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.2778190507118052, "eval_wikibio_token_set_f1_sem": 0.015240676925459235, "eval_wikibio_token_set_precision": 0.23894461234425302, "eval_wikibio_token_set_recall": 0.38476927100045494, "eval_wikibio_true_num_tokens": 99.9375, "step": 175000 }, { "epoch": 10.235713867930047, "eval_msmarco_accuracy": 0.374796875, "eval_msmarco_bleu_score": 12.43088499179895, "eval_msmarco_bleu_score_sem": 1.0806155967445172, "eval_msmarco_emb_cos_sim": 0.7555303573608398, "eval_msmarco_emb_cos_sim_sem": 0.016498636454343796, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.9061672687530518, "eval_msmarco_n_ngrams_match_1": 25.0625, "eval_msmarco_n_ngrams_match_2": 10.1171875, "eval_msmarco_n_ngrams_match_3": 5.0390625, "eval_msmarco_num_pred_words": 57.8671875, "eval_msmarco_num_true_words": 63.421875, "eval_msmarco_perplexity": 6.72725556163069, "eval_msmarco_pred_num_tokens": 79.65625, "eval_msmarco_rouge_score": 0.3922410421360255, "eval_msmarco_runtime": 25.8614, "eval_msmarco_samples_per_second": 19.334, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4174172497626908, "eval_msmarco_token_set_f1_sem": 0.012885656641556314, "eval_msmarco_token_set_precision": 0.37233650461541884, "eval_msmarco_token_set_recall": 0.4936425464245062, "eval_msmarco_true_num_tokens": 81.5390625, "step": 175000 }, { "epoch": 10.241562847283149, "grad_norm": 0.2758668065071106, "learning_rate": 0.001, "loss": 2.0249, "step": 175100 }, { "epoch": 10.247411826636252, "grad_norm": 0.31300005316734314, "learning_rate": 0.001, "loss": 2.0202, "step": 175200 }, { "epoch": 10.253260805989354, "grad_norm": 0.27813494205474854, "learning_rate": 0.001, "loss": 2.0207, "step": 175300 }, { "epoch": 10.259109785342458, "grad_norm": 0.2724533677101135, "learning_rate": 0.001, "loss": 2.018, "step": 175400 }, { "epoch": 10.26495876469556, "grad_norm": 0.239198699593544, "learning_rate": 0.001, "loss": 2.0168, "step": 175500 }, { "epoch": 10.270807744048664, "grad_norm": 0.3504115641117096, "learning_rate": 0.001, "loss": 2.0147, "step": 175600 }, { "epoch": 10.276656723401766, "grad_norm": 0.2360936850309372, "learning_rate": 0.001, "loss": 2.0154, "step": 175700 }, { "epoch": 10.28250570275487, "grad_norm": 0.18917636573314667, "learning_rate": 0.001, "loss": 2.0121, "step": 175800 }, { "epoch": 10.288354682107972, "grad_norm": 0.24362166225910187, "learning_rate": 0.001, "loss": 2.0125, "step": 175900 }, { "epoch": 10.294203661461076, "grad_norm": 0.21267713606357574, "learning_rate": 0.001, "loss": 2.0182, "step": 176000 }, { "epoch": 10.300052640814178, "grad_norm": 0.1836772859096527, "learning_rate": 0.001, "loss": 2.0219, "step": 176100 }, { "epoch": 10.305901620167282, "grad_norm": 0.26108574867248535, "learning_rate": 0.001, "loss": 2.0188, "step": 176200 }, { "epoch": 10.311750599520384, "grad_norm": 0.2722971439361572, "learning_rate": 0.001, "loss": 2.0131, "step": 176300 }, { "epoch": 10.317599578873487, "grad_norm": 0.31263428926467896, "learning_rate": 0.001, "loss": 2.016, "step": 176400 }, { "epoch": 10.32344855822659, "grad_norm": 0.30773454904556274, "learning_rate": 0.001, "loss": 2.0153, "step": 176500 }, { "epoch": 10.329297537579693, "grad_norm": 0.3479011654853821, "learning_rate": 0.001, "loss": 2.0203, "step": 176600 }, { "epoch": 10.335146516932795, "grad_norm": 0.22559037804603577, "learning_rate": 0.001, "loss": 2.0187, "step": 176700 }, { "epoch": 10.340995496285899, "grad_norm": 0.2091871052980423, "learning_rate": 0.001, "loss": 2.0249, "step": 176800 }, { "epoch": 10.346844475639001, "grad_norm": 0.20897243916988373, "learning_rate": 0.001, "loss": 2.0148, "step": 176900 }, { "epoch": 10.352693454992103, "grad_norm": 0.2231776863336563, "learning_rate": 0.001, "loss": 2.0169, "step": 177000 }, { "epoch": 10.358542434345207, "grad_norm": 0.29804185032844543, "learning_rate": 0.001, "loss": 2.0161, "step": 177100 }, { "epoch": 10.364391413698309, "grad_norm": 0.2058599293231964, "learning_rate": 0.001, "loss": 2.0225, "step": 177200 }, { "epoch": 10.370240393051413, "grad_norm": 0.25718748569488525, "learning_rate": 0.001, "loss": 2.0196, "step": 177300 }, { "epoch": 10.376089372404515, "grad_norm": 0.2560168206691742, "learning_rate": 0.001, "loss": 2.0114, "step": 177400 }, { "epoch": 10.381938351757618, "grad_norm": 0.17987829446792603, "learning_rate": 0.001, "loss": 2.0168, "step": 177500 }, { "epoch": 10.38778733111072, "grad_norm": 0.19924317300319672, "learning_rate": 0.001, "loss": 2.0157, "step": 177600 }, { "epoch": 10.393636310463824, "grad_norm": 0.1960504800081253, "learning_rate": 0.001, "loss": 2.0184, "step": 177700 }, { "epoch": 10.399485289816926, "grad_norm": 0.3183183968067169, "learning_rate": 0.001, "loss": 2.0204, "step": 177800 }, { "epoch": 10.40533426917003, "grad_norm": 0.2212894707918167, "learning_rate": 0.001, "loss": 2.0228, "step": 177900 }, { "epoch": 10.411183248523132, "grad_norm": 0.1970527321100235, "learning_rate": 0.001, "loss": 2.0155, "step": 178000 }, { "epoch": 10.417032227876236, "grad_norm": 0.2213178277015686, "learning_rate": 0.001, "loss": 2.0121, "step": 178100 }, { "epoch": 10.422881207229338, "grad_norm": 0.32180729508399963, "learning_rate": 0.001, "loss": 2.0228, "step": 178200 }, { "epoch": 10.428730186582442, "grad_norm": 0.2852908968925476, "learning_rate": 0.001, "loss": 2.0172, "step": 178300 }, { "epoch": 10.434579165935544, "grad_norm": 0.2534542977809906, "learning_rate": 0.001, "loss": 2.0192, "step": 178400 }, { "epoch": 10.440428145288648, "grad_norm": 0.26526838541030884, "learning_rate": 0.001, "loss": 2.0147, "step": 178500 }, { "epoch": 10.44627712464175, "grad_norm": 0.28905776143074036, "learning_rate": 0.001, "loss": 2.031, "step": 178600 }, { "epoch": 10.452126103994853, "grad_norm": 0.31868672370910645, "learning_rate": 0.001, "loss": 2.0132, "step": 178700 }, { "epoch": 10.457975083347955, "grad_norm": 0.29139444231987, "learning_rate": 0.001, "loss": 2.0224, "step": 178800 }, { "epoch": 10.46382406270106, "grad_norm": 0.2434409260749817, "learning_rate": 0.001, "loss": 2.0187, "step": 178900 }, { "epoch": 10.469673042054161, "grad_norm": 0.2236948311328888, "learning_rate": 0.001, "loss": 2.0141, "step": 179000 }, { "epoch": 10.475522021407265, "grad_norm": 0.18333914875984192, "learning_rate": 0.001, "loss": 2.0228, "step": 179100 }, { "epoch": 10.481371000760367, "grad_norm": 0.26587095856666565, "learning_rate": 0.001, "loss": 2.0187, "step": 179200 }, { "epoch": 10.487219980113471, "grad_norm": 0.2555517256259918, "learning_rate": 0.001, "loss": 2.0199, "step": 179300 }, { "epoch": 10.493068959466573, "grad_norm": 0.2563992142677307, "learning_rate": 0.001, "loss": 2.0177, "step": 179400 }, { "epoch": 10.498917938819677, "grad_norm": 0.22177983820438385, "learning_rate": 0.001, "loss": 2.0116, "step": 179500 }, { "epoch": 10.504766918172779, "grad_norm": 0.30520832538604736, "learning_rate": 0.001, "loss": 2.0152, "step": 179600 }, { "epoch": 10.510615897525883, "grad_norm": 0.21793676912784576, "learning_rate": 0.001, "loss": 2.0233, "step": 179700 }, { "epoch": 10.516464876878985, "grad_norm": 0.27203842997550964, "learning_rate": 0.001, "loss": 2.0212, "step": 179800 }, { "epoch": 10.522313856232088, "grad_norm": 0.2595706582069397, "learning_rate": 0.001, "loss": 2.0221, "step": 179900 }, { "epoch": 10.52816283558519, "grad_norm": 0.2572323679924011, "learning_rate": 0.001, "loss": 2.0181, "step": 180000 }, { "epoch": 10.534011814938292, "grad_norm": 0.21040984988212585, "learning_rate": 0.001, "loss": 2.0138, "step": 180100 }, { "epoch": 10.539860794291396, "grad_norm": 0.29415932297706604, "learning_rate": 0.001, "loss": 2.022, "step": 180200 }, { "epoch": 10.545709773644498, "grad_norm": 0.18975061178207397, "learning_rate": 0.001, "loss": 2.015, "step": 180300 }, { "epoch": 10.551558752997602, "grad_norm": 0.2548379600048065, "learning_rate": 0.001, "loss": 2.0133, "step": 180400 }, { "epoch": 10.557407732350704, "grad_norm": 0.27733296155929565, "learning_rate": 0.001, "loss": 2.0167, "step": 180500 }, { "epoch": 10.563256711703808, "grad_norm": 0.2804897725582123, "learning_rate": 0.001, "loss": 2.0183, "step": 180600 }, { "epoch": 10.56910569105691, "grad_norm": 0.2301812469959259, "learning_rate": 0.001, "loss": 2.0156, "step": 180700 }, { "epoch": 10.574954670410014, "grad_norm": 0.2566080689430237, "learning_rate": 0.001, "loss": 2.0194, "step": 180800 }, { "epoch": 10.580803649763116, "grad_norm": 0.24456988275051117, "learning_rate": 0.001, "loss": 2.0157, "step": 180900 }, { "epoch": 10.58665262911622, "grad_norm": 0.24291947484016418, "learning_rate": 0.001, "loss": 2.0175, "step": 181000 }, { "epoch": 10.592501608469322, "grad_norm": 0.241173654794693, "learning_rate": 0.001, "loss": 2.0131, "step": 181100 }, { "epoch": 10.598350587822425, "grad_norm": 0.22204963862895966, "learning_rate": 0.001, "loss": 2.0148, "step": 181200 }, { "epoch": 10.604199567175527, "grad_norm": 0.23097024857997894, "learning_rate": 0.001, "loss": 2.018, "step": 181300 }, { "epoch": 10.610048546528631, "grad_norm": 0.261029988527298, "learning_rate": 0.001, "loss": 2.0174, "step": 181400 }, { "epoch": 10.615897525881733, "grad_norm": 0.26250317692756653, "learning_rate": 0.001, "loss": 2.0143, "step": 181500 }, { "epoch": 10.621746505234837, "grad_norm": 0.2545125186443329, "learning_rate": 0.001, "loss": 2.0154, "step": 181600 }, { "epoch": 10.627595484587939, "grad_norm": 0.25632375478744507, "learning_rate": 0.001, "loss": 2.02, "step": 181700 }, { "epoch": 10.633444463941043, "grad_norm": 0.2809964716434479, "learning_rate": 0.001, "loss": 2.0213, "step": 181800 }, { "epoch": 10.639293443294145, "grad_norm": 0.24211782217025757, "learning_rate": 0.001, "loss": 2.0239, "step": 181900 }, { "epoch": 10.645142422647249, "grad_norm": 0.23945142328739166, "learning_rate": 0.001, "loss": 2.0208, "step": 182000 }, { "epoch": 10.65099140200035, "grad_norm": 0.2121644765138626, "learning_rate": 0.001, "loss": 2.0222, "step": 182100 }, { "epoch": 10.656840381353454, "grad_norm": 0.25141504406929016, "learning_rate": 0.001, "loss": 2.0129, "step": 182200 }, { "epoch": 10.662689360706556, "grad_norm": 0.23346230387687683, "learning_rate": 0.001, "loss": 2.0187, "step": 182300 }, { "epoch": 10.66853834005966, "grad_norm": 0.2028612494468689, "learning_rate": 0.001, "loss": 2.0181, "step": 182400 }, { "epoch": 10.674387319412762, "grad_norm": 0.3339592516422272, "learning_rate": 0.001, "loss": 2.0179, "step": 182500 }, { "epoch": 10.680236298765866, "grad_norm": 0.27618664503097534, "learning_rate": 0.001, "loss": 2.0205, "step": 182600 }, { "epoch": 10.686085278118968, "grad_norm": 0.2017868459224701, "learning_rate": 0.001, "loss": 2.0145, "step": 182700 }, { "epoch": 10.691934257472072, "grad_norm": 0.28648778796195984, "learning_rate": 0.001, "loss": 2.0175, "step": 182800 }, { "epoch": 10.697783236825174, "grad_norm": 0.2781062126159668, "learning_rate": 0.001, "loss": 2.0167, "step": 182900 }, { "epoch": 10.703632216178278, "grad_norm": 0.22328762710094452, "learning_rate": 0.001, "loss": 2.0158, "step": 183000 }, { "epoch": 10.70948119553138, "grad_norm": 0.2758755683898926, "learning_rate": 0.001, "loss": 2.0208, "step": 183100 }, { "epoch": 10.715330174884482, "grad_norm": 0.2120978981256485, "learning_rate": 0.001, "loss": 2.0168, "step": 183200 }, { "epoch": 10.721179154237586, "grad_norm": 0.32760170102119446, "learning_rate": 0.001, "loss": 2.0216, "step": 183300 }, { "epoch": 10.727028133590688, "grad_norm": 0.2367255985736847, "learning_rate": 0.001, "loss": 2.0199, "step": 183400 }, { "epoch": 10.732877112943791, "grad_norm": 0.2277698963880539, "learning_rate": 0.001, "loss": 2.0122, "step": 183500 }, { "epoch": 10.738726092296893, "grad_norm": 0.25172653794288635, "learning_rate": 0.001, "loss": 2.0153, "step": 183600 }, { "epoch": 10.744575071649997, "grad_norm": 0.22678861021995544, "learning_rate": 0.001, "loss": 2.0189, "step": 183700 }, { "epoch": 10.7504240510031, "grad_norm": 0.25237175822257996, "learning_rate": 0.001, "loss": 2.0201, "step": 183800 }, { "epoch": 10.756273030356203, "grad_norm": 0.21524378657341003, "learning_rate": 0.001, "loss": 2.0156, "step": 183900 }, { "epoch": 10.762122009709305, "grad_norm": 0.2433275282382965, "learning_rate": 0.001, "loss": 2.0168, "step": 184000 }, { "epoch": 10.767970989062409, "grad_norm": 0.2455408126115799, "learning_rate": 0.001, "loss": 2.0167, "step": 184100 }, { "epoch": 10.773819968415511, "grad_norm": 0.2798559367656708, "learning_rate": 0.001, "loss": 2.0149, "step": 184200 }, { "epoch": 10.779668947768615, "grad_norm": 0.25583040714263916, "learning_rate": 0.001, "loss": 2.0183, "step": 184300 }, { "epoch": 10.785517927121717, "grad_norm": 0.23670853674411774, "learning_rate": 0.001, "loss": 2.0151, "step": 184400 }, { "epoch": 10.79136690647482, "grad_norm": 0.22647707164287567, "learning_rate": 0.001, "loss": 2.0119, "step": 184500 }, { "epoch": 10.797215885827923, "grad_norm": 0.23308242857456207, "learning_rate": 0.001, "loss": 2.0159, "step": 184600 }, { "epoch": 10.803064865181026, "grad_norm": 0.23094014823436737, "learning_rate": 0.001, "loss": 2.0206, "step": 184700 }, { "epoch": 10.808913844534128, "grad_norm": 0.25819072127342224, "learning_rate": 0.001, "loss": 2.0183, "step": 184800 }, { "epoch": 10.814762823887232, "grad_norm": 0.2505772113800049, "learning_rate": 0.001, "loss": 2.0162, "step": 184900 }, { "epoch": 10.820611803240334, "grad_norm": 0.21987120807170868, "learning_rate": 0.001, "loss": 2.0124, "step": 185000 }, { "epoch": 10.826460782593438, "grad_norm": 0.2525466978549957, "learning_rate": 0.001, "loss": 2.021, "step": 185100 }, { "epoch": 10.83230976194654, "grad_norm": 0.27500104904174805, "learning_rate": 0.001, "loss": 2.0229, "step": 185200 }, { "epoch": 10.838158741299644, "grad_norm": 0.24566251039505005, "learning_rate": 0.001, "loss": 2.0161, "step": 185300 }, { "epoch": 10.844007720652746, "grad_norm": 0.22641661763191223, "learning_rate": 0.001, "loss": 2.0138, "step": 185400 }, { "epoch": 10.84985670000585, "grad_norm": 0.24100361764431, "learning_rate": 0.001, "loss": 2.0167, "step": 185500 }, { "epoch": 10.855705679358952, "grad_norm": 0.23787255585193634, "learning_rate": 0.001, "loss": 2.0152, "step": 185600 }, { "epoch": 10.861554658712055, "grad_norm": 0.24687497317790985, "learning_rate": 0.001, "loss": 2.0188, "step": 185700 }, { "epoch": 10.867403638065158, "grad_norm": 0.27574941515922546, "learning_rate": 0.001, "loss": 2.0179, "step": 185800 }, { "epoch": 10.873252617418261, "grad_norm": 0.25548359751701355, "learning_rate": 0.001, "loss": 2.0095, "step": 185900 }, { "epoch": 10.879101596771363, "grad_norm": 0.25334054231643677, "learning_rate": 0.001, "loss": 2.0144, "step": 186000 }, { "epoch": 10.884950576124467, "grad_norm": 0.19309964776039124, "learning_rate": 0.001, "loss": 2.0137, "step": 186100 }, { "epoch": 10.89079955547757, "grad_norm": 0.3331020474433899, "learning_rate": 0.001, "loss": 2.0194, "step": 186200 }, { "epoch": 10.896648534830671, "grad_norm": 0.20689035952091217, "learning_rate": 0.001, "loss": 2.0184, "step": 186300 }, { "epoch": 10.902497514183775, "grad_norm": 0.28475692868232727, "learning_rate": 0.001, "loss": 2.0168, "step": 186400 }, { "epoch": 10.908346493536877, "grad_norm": 0.26652440428733826, "learning_rate": 0.001, "loss": 2.0188, "step": 186500 }, { "epoch": 10.91419547288998, "grad_norm": 0.20084771513938904, "learning_rate": 0.001, "loss": 2.0194, "step": 186600 }, { "epoch": 10.920044452243083, "grad_norm": 0.17400167882442474, "learning_rate": 0.001, "loss": 2.0144, "step": 186700 }, { "epoch": 10.925893431596187, "grad_norm": 0.3755057454109192, "learning_rate": 0.001, "loss": 2.0198, "step": 186800 }, { "epoch": 10.931742410949289, "grad_norm": 0.26133665442466736, "learning_rate": 0.001, "loss": 2.0262, "step": 186900 }, { "epoch": 10.937591390302392, "grad_norm": 0.26638728380203247, "learning_rate": 0.001, "loss": 2.0173, "step": 187000 }, { "epoch": 10.943440369655494, "grad_norm": 0.24956031143665314, "learning_rate": 0.001, "loss": 2.023, "step": 187100 }, { "epoch": 10.949289349008598, "grad_norm": 0.22023475170135498, "learning_rate": 0.001, "loss": 2.0162, "step": 187200 }, { "epoch": 10.9551383283617, "grad_norm": 0.2287941426038742, "learning_rate": 0.001, "loss": 2.0203, "step": 187300 }, { "epoch": 10.960987307714804, "grad_norm": 0.23611114919185638, "learning_rate": 0.001, "loss": 2.0191, "step": 187400 }, { "epoch": 10.966836287067906, "grad_norm": 0.22788207232952118, "learning_rate": 0.001, "loss": 2.0208, "step": 187500 }, { "epoch": 10.97268526642101, "grad_norm": 0.31827521324157715, "learning_rate": 0.001, "loss": 2.0186, "step": 187600 }, { "epoch": 10.978534245774112, "grad_norm": 0.1856229454278946, "learning_rate": 0.001, "loss": 2.0113, "step": 187700 }, { "epoch": 10.984383225127216, "grad_norm": 0.39983847737312317, "learning_rate": 0.001, "loss": 2.0201, "step": 187800 }, { "epoch": 10.990232204480318, "grad_norm": 0.2980829179286957, "learning_rate": 0.001, "loss": 2.0159, "step": 187900 }, { "epoch": 10.996081183833422, "grad_norm": 0.219138041138649, "learning_rate": 0.001, "loss": 2.0145, "step": 188000 }, { "epoch": 11.001930163186524, "grad_norm": 0.2186313420534134, "learning_rate": 0.001, "loss": 2.0087, "step": 188100 }, { "epoch": 11.007779142539627, "grad_norm": 0.3106873035430908, "learning_rate": 0.001, "loss": 1.9984, "step": 188200 }, { "epoch": 11.01362812189273, "grad_norm": 0.33530113101005554, "learning_rate": 0.001, "loss": 2.0037, "step": 188300 }, { "epoch": 11.019477101245833, "grad_norm": 0.22164186835289001, "learning_rate": 0.001, "loss": 1.9963, "step": 188400 }, { "epoch": 11.025326080598935, "grad_norm": 0.19216325879096985, "learning_rate": 0.001, "loss": 1.9983, "step": 188500 }, { "epoch": 11.031175059952039, "grad_norm": 0.2649822533130646, "learning_rate": 0.001, "loss": 1.9987, "step": 188600 }, { "epoch": 11.037024039305141, "grad_norm": 0.21572758257389069, "learning_rate": 0.001, "loss": 1.9973, "step": 188700 }, { "epoch": 11.042873018658245, "grad_norm": 0.20914825797080994, "learning_rate": 0.001, "loss": 1.9966, "step": 188800 }, { "epoch": 11.048721998011347, "grad_norm": 0.36171504855155945, "learning_rate": 0.001, "loss": 2.0028, "step": 188900 }, { "epoch": 11.05457097736445, "grad_norm": 0.17909982800483704, "learning_rate": 0.001, "loss": 1.9983, "step": 189000 }, { "epoch": 11.060419956717553, "grad_norm": 0.22673270106315613, "learning_rate": 0.001, "loss": 2.0005, "step": 189100 }, { "epoch": 11.066268936070657, "grad_norm": 0.23868079483509064, "learning_rate": 0.001, "loss": 2.0001, "step": 189200 }, { "epoch": 11.072117915423759, "grad_norm": 0.24784347414970398, "learning_rate": 0.001, "loss": 2.0025, "step": 189300 }, { "epoch": 11.07796689477686, "grad_norm": 0.25394967198371887, "learning_rate": 0.001, "loss": 2.0024, "step": 189400 }, { "epoch": 11.083815874129964, "grad_norm": 0.2160046249628067, "learning_rate": 0.001, "loss": 2.0021, "step": 189500 }, { "epoch": 11.089664853483066, "grad_norm": 0.28959327936172485, "learning_rate": 0.001, "loss": 2.0018, "step": 189600 }, { "epoch": 11.09551383283617, "grad_norm": 0.2557681202888489, "learning_rate": 0.001, "loss": 1.9966, "step": 189700 }, { "epoch": 11.101362812189272, "grad_norm": 0.25041264295578003, "learning_rate": 0.001, "loss": 2.0038, "step": 189800 }, { "epoch": 11.107211791542376, "grad_norm": 0.3133576214313507, "learning_rate": 0.001, "loss": 2.0028, "step": 189900 }, { "epoch": 11.113060770895478, "grad_norm": 0.2637767791748047, "learning_rate": 0.001, "loss": 2.0003, "step": 190000 }, { "epoch": 11.118909750248582, "grad_norm": 0.2710079848766327, "learning_rate": 0.001, "loss": 2.0021, "step": 190100 }, { "epoch": 11.124758729601684, "grad_norm": 0.18513745069503784, "learning_rate": 0.001, "loss": 2.0016, "step": 190200 }, { "epoch": 11.130607708954788, "grad_norm": 0.19571202993392944, "learning_rate": 0.001, "loss": 2.001, "step": 190300 }, { "epoch": 11.13645668830789, "grad_norm": 0.22393815219402313, "learning_rate": 0.001, "loss": 2.0001, "step": 190400 }, { "epoch": 11.142305667660994, "grad_norm": 0.26865142583847046, "learning_rate": 0.001, "loss": 1.9979, "step": 190500 }, { "epoch": 11.148154647014096, "grad_norm": 0.2897537648677826, "learning_rate": 0.001, "loss": 2.0023, "step": 190600 }, { "epoch": 11.1540036263672, "grad_norm": 0.2259036749601364, "learning_rate": 0.001, "loss": 2.0051, "step": 190700 }, { "epoch": 11.159852605720301, "grad_norm": 0.251522034406662, "learning_rate": 0.001, "loss": 2.0052, "step": 190800 }, { "epoch": 11.165701585073405, "grad_norm": 0.24849402904510498, "learning_rate": 0.001, "loss": 2.0074, "step": 190900 }, { "epoch": 11.171550564426507, "grad_norm": 0.2094811499118805, "learning_rate": 0.001, "loss": 2.0049, "step": 191000 }, { "epoch": 11.177399543779611, "grad_norm": 0.20695583522319794, "learning_rate": 0.001, "loss": 2.0065, "step": 191100 }, { "epoch": 11.183248523132713, "grad_norm": 0.3143620193004608, "learning_rate": 0.001, "loss": 2.0022, "step": 191200 }, { "epoch": 11.189097502485817, "grad_norm": 0.40891122817993164, "learning_rate": 0.001, "loss": 2.004, "step": 191300 }, { "epoch": 11.194946481838919, "grad_norm": 0.285576730966568, "learning_rate": 0.001, "loss": 2.0036, "step": 191400 }, { "epoch": 11.200795461192023, "grad_norm": 0.29602304100990295, "learning_rate": 0.001, "loss": 1.9982, "step": 191500 }, { "epoch": 11.206644440545125, "grad_norm": 0.28282618522644043, "learning_rate": 0.001, "loss": 2.0034, "step": 191600 }, { "epoch": 11.212493419898228, "grad_norm": 0.3086845576763153, "learning_rate": 0.001, "loss": 2.0037, "step": 191700 }, { "epoch": 11.21834239925133, "grad_norm": 0.27019116282463074, "learning_rate": 0.001, "loss": 1.9998, "step": 191800 }, { "epoch": 11.224191378604434, "grad_norm": 0.2094966620206833, "learning_rate": 0.001, "loss": 2.001, "step": 191900 }, { "epoch": 11.230040357957536, "grad_norm": 0.2786903381347656, "learning_rate": 0.001, "loss": 1.9965, "step": 192000 }, { "epoch": 11.23588933731064, "grad_norm": 0.23663155734539032, "learning_rate": 0.001, "loss": 2.0077, "step": 192100 }, { "epoch": 11.241738316663742, "grad_norm": 0.2248982936143875, "learning_rate": 0.001, "loss": 2.0034, "step": 192200 }, { "epoch": 11.247587296016846, "grad_norm": 0.25248831510543823, "learning_rate": 0.001, "loss": 2.0032, "step": 192300 }, { "epoch": 11.253436275369948, "grad_norm": 0.30615153908729553, "learning_rate": 0.001, "loss": 2.0032, "step": 192400 }, { "epoch": 11.25928525472305, "grad_norm": 0.23003172874450684, "learning_rate": 0.001, "loss": 2.0, "step": 192500 }, { "epoch": 11.265134234076154, "grad_norm": 0.2533700168132782, "learning_rate": 0.001, "loss": 2.0008, "step": 192600 }, { "epoch": 11.270983213429256, "grad_norm": 0.2751188278198242, "learning_rate": 0.001, "loss": 2.0091, "step": 192700 }, { "epoch": 11.27683219278236, "grad_norm": 0.3191012442111969, "learning_rate": 0.001, "loss": 2.0061, "step": 192800 }, { "epoch": 11.282681172135462, "grad_norm": 0.22529400885105133, "learning_rate": 0.001, "loss": 1.9971, "step": 192900 }, { "epoch": 11.288530151488565, "grad_norm": 0.21129187941551208, "learning_rate": 0.001, "loss": 2.0031, "step": 193000 }, { "epoch": 11.294379130841667, "grad_norm": 0.3076889216899872, "learning_rate": 0.001, "loss": 2.0037, "step": 193100 }, { "epoch": 11.300228110194771, "grad_norm": 0.23607444763183594, "learning_rate": 0.001, "loss": 2.0001, "step": 193200 }, { "epoch": 11.306077089547873, "grad_norm": 0.24121956527233124, "learning_rate": 0.001, "loss": 2.0052, "step": 193300 }, { "epoch": 11.311926068900977, "grad_norm": 0.21479696035385132, "learning_rate": 0.001, "loss": 2.0049, "step": 193400 }, { "epoch": 11.317775048254079, "grad_norm": 0.2280740886926651, "learning_rate": 0.001, "loss": 2.0085, "step": 193500 }, { "epoch": 11.323624027607183, "grad_norm": 0.29918399453163147, "learning_rate": 0.001, "loss": 1.9973, "step": 193600 }, { "epoch": 11.329473006960285, "grad_norm": 0.2641150951385498, "learning_rate": 0.001, "loss": 2.0054, "step": 193700 }, { "epoch": 11.335321986313389, "grad_norm": 0.23915159702301025, "learning_rate": 0.001, "loss": 2.0005, "step": 193800 }, { "epoch": 11.34117096566649, "grad_norm": 0.24459904432296753, "learning_rate": 0.001, "loss": 2.0042, "step": 193900 }, { "epoch": 11.347019945019595, "grad_norm": 0.18952134251594543, "learning_rate": 0.001, "loss": 2.0007, "step": 194000 }, { "epoch": 11.352868924372697, "grad_norm": 0.2601996660232544, "learning_rate": 0.001, "loss": 2.0069, "step": 194100 }, { "epoch": 11.3587179037258, "grad_norm": 0.24816280603408813, "learning_rate": 0.001, "loss": 2.0029, "step": 194200 }, { "epoch": 11.364566883078902, "grad_norm": 0.22129762172698975, "learning_rate": 0.001, "loss": 2.0032, "step": 194300 }, { "epoch": 11.370415862432006, "grad_norm": 0.22058776021003723, "learning_rate": 0.001, "loss": 2.0033, "step": 194400 }, { "epoch": 11.376264841785108, "grad_norm": 0.2274969220161438, "learning_rate": 0.001, "loss": 2.0029, "step": 194500 }, { "epoch": 11.382113821138212, "grad_norm": 0.24912381172180176, "learning_rate": 0.001, "loss": 2.0031, "step": 194600 }, { "epoch": 11.387962800491314, "grad_norm": 0.22388483583927155, "learning_rate": 0.001, "loss": 2.0058, "step": 194700 }, { "epoch": 11.393811779844418, "grad_norm": 0.20228897035121918, "learning_rate": 0.001, "loss": 2.003, "step": 194800 }, { "epoch": 11.39966075919752, "grad_norm": 0.2841532826423645, "learning_rate": 0.001, "loss": 2.0074, "step": 194900 }, { "epoch": 11.405509738550624, "grad_norm": 0.28116095066070557, "learning_rate": 0.001, "loss": 2.0015, "step": 195000 }, { "epoch": 11.411358717903726, "grad_norm": 0.2731442153453827, "learning_rate": 0.001, "loss": 2.0067, "step": 195100 }, { "epoch": 11.41720769725683, "grad_norm": 0.21980546414852142, "learning_rate": 0.001, "loss": 2.0061, "step": 195200 }, { "epoch": 11.423056676609932, "grad_norm": 0.23793433606624603, "learning_rate": 0.001, "loss": 2.0053, "step": 195300 }, { "epoch": 11.428905655963035, "grad_norm": 0.1830388307571411, "learning_rate": 0.001, "loss": 2.0063, "step": 195400 }, { "epoch": 11.434754635316137, "grad_norm": 0.2747815251350403, "learning_rate": 0.001, "loss": 2.0044, "step": 195500 }, { "epoch": 11.44060361466924, "grad_norm": 0.19600221514701843, "learning_rate": 0.001, "loss": 2.0037, "step": 195600 }, { "epoch": 11.446452594022343, "grad_norm": 0.2408866137266159, "learning_rate": 0.001, "loss": 2.0072, "step": 195700 }, { "epoch": 11.452301573375445, "grad_norm": 0.18318156898021698, "learning_rate": 0.001, "loss": 2.0007, "step": 195800 }, { "epoch": 11.458150552728549, "grad_norm": 0.20794811844825745, "learning_rate": 0.001, "loss": 2.0042, "step": 195900 }, { "epoch": 11.463999532081651, "grad_norm": 0.2751065194606781, "learning_rate": 0.001, "loss": 2.0009, "step": 196000 }, { "epoch": 11.469848511434755, "grad_norm": 0.2596251666545868, "learning_rate": 0.001, "loss": 2.0111, "step": 196100 }, { "epoch": 11.475697490787857, "grad_norm": 0.19436004757881165, "learning_rate": 0.001, "loss": 2.0022, "step": 196200 }, { "epoch": 11.48154647014096, "grad_norm": 0.2515554428100586, "learning_rate": 0.001, "loss": 2.0035, "step": 196300 }, { "epoch": 11.487395449494063, "grad_norm": 0.2224837690591812, "learning_rate": 0.001, "loss": 2.0005, "step": 196400 }, { "epoch": 11.493244428847166, "grad_norm": 0.2709154188632965, "learning_rate": 0.001, "loss": 2.0058, "step": 196500 }, { "epoch": 11.499093408200268, "grad_norm": 0.272002637386322, "learning_rate": 0.001, "loss": 2.0044, "step": 196600 }, { "epoch": 11.504942387553372, "grad_norm": 0.21350309252738953, "learning_rate": 0.001, "loss": 2.0045, "step": 196700 }, { "epoch": 11.510791366906474, "grad_norm": 0.28760209679603577, "learning_rate": 0.001, "loss": 2.0066, "step": 196800 }, { "epoch": 11.516640346259578, "grad_norm": 0.19713732600212097, "learning_rate": 0.001, "loss": 2.002, "step": 196900 }, { "epoch": 11.52248932561268, "grad_norm": 0.29070165753364563, "learning_rate": 0.001, "loss": 2.0014, "step": 197000 }, { "epoch": 11.528338304965784, "grad_norm": 0.27396097779273987, "learning_rate": 0.001, "loss": 2.0054, "step": 197100 }, { "epoch": 11.534187284318886, "grad_norm": 0.23292513191699982, "learning_rate": 0.001, "loss": 2.0091, "step": 197200 }, { "epoch": 11.54003626367199, "grad_norm": 0.24754372239112854, "learning_rate": 0.001, "loss": 2.0058, "step": 197300 }, { "epoch": 11.545885243025092, "grad_norm": 0.2667694091796875, "learning_rate": 0.001, "loss": 2.0092, "step": 197400 }, { "epoch": 11.551734222378196, "grad_norm": 0.2060028463602066, "learning_rate": 0.001, "loss": 1.997, "step": 197500 }, { "epoch": 11.557583201731298, "grad_norm": 0.21151739358901978, "learning_rate": 0.001, "loss": 2.0155, "step": 197600 }, { "epoch": 11.563432181084401, "grad_norm": 0.24915218353271484, "learning_rate": 0.001, "loss": 2.003, "step": 197700 }, { "epoch": 11.569281160437503, "grad_norm": 0.2133326232433319, "learning_rate": 0.001, "loss": 2.0038, "step": 197800 }, { "epoch": 11.575130139790607, "grad_norm": 0.22748351097106934, "learning_rate": 0.001, "loss": 2.0094, "step": 197900 }, { "epoch": 11.58097911914371, "grad_norm": 0.21045786142349243, "learning_rate": 0.001, "loss": 2.0031, "step": 198000 }, { "epoch": 11.586828098496813, "grad_norm": 0.2661415934562683, "learning_rate": 0.001, "loss": 1.997, "step": 198100 }, { "epoch": 11.592677077849915, "grad_norm": 0.2826390862464905, "learning_rate": 0.001, "loss": 1.9999, "step": 198200 }, { "epoch": 11.598526057203019, "grad_norm": 0.28920742869377136, "learning_rate": 0.001, "loss": 2.0046, "step": 198300 }, { "epoch": 11.604375036556121, "grad_norm": 0.2932990789413452, "learning_rate": 0.001, "loss": 2.0061, "step": 198400 }, { "epoch": 11.610224015909225, "grad_norm": 0.23478125035762787, "learning_rate": 0.001, "loss": 2.0034, "step": 198500 }, { "epoch": 11.616072995262327, "grad_norm": 0.26904553174972534, "learning_rate": 0.001, "loss": 2.0003, "step": 198600 }, { "epoch": 11.621921974615429, "grad_norm": 0.25289273262023926, "learning_rate": 0.001, "loss": 2.0109, "step": 198700 }, { "epoch": 11.627770953968533, "grad_norm": 0.19304195046424866, "learning_rate": 0.001, "loss": 2.0016, "step": 198800 }, { "epoch": 11.633619933321635, "grad_norm": 0.21612083911895752, "learning_rate": 0.001, "loss": 2.0005, "step": 198900 }, { "epoch": 11.639468912674738, "grad_norm": 0.378410279750824, "learning_rate": 0.001, "loss": 2.0097, "step": 199000 }, { "epoch": 11.64531789202784, "grad_norm": 0.2701795697212219, "learning_rate": 0.001, "loss": 2.0122, "step": 199100 }, { "epoch": 11.651166871380944, "grad_norm": 0.2229175567626953, "learning_rate": 0.001, "loss": 2.0004, "step": 199200 }, { "epoch": 11.657015850734046, "grad_norm": 0.35914769768714905, "learning_rate": 0.001, "loss": 2.0039, "step": 199300 }, { "epoch": 11.66286483008715, "grad_norm": 0.27085983753204346, "learning_rate": 0.001, "loss": 2.003, "step": 199400 }, { "epoch": 11.668713809440252, "grad_norm": 0.217853382229805, "learning_rate": 0.001, "loss": 2.0068, "step": 199500 }, { "epoch": 11.674562788793356, "grad_norm": 0.21649140119552612, "learning_rate": 0.001, "loss": 2.0016, "step": 199600 }, { "epoch": 11.680411768146458, "grad_norm": 0.23176294565200806, "learning_rate": 0.001, "loss": 1.9912, "step": 199700 }, { "epoch": 11.686260747499562, "grad_norm": 0.19618314504623413, "learning_rate": 0.001, "loss": 2.0005, "step": 199800 }, { "epoch": 11.692109726852664, "grad_norm": 0.20554816722869873, "learning_rate": 0.001, "loss": 1.9961, "step": 199900 }, { "epoch": 11.697958706205767, "grad_norm": 0.2511617839336395, "learning_rate": 0.001, "loss": 2.0061, "step": 200000 }, { "epoch": 11.697958706205767, "eval_ag_news_accuracy": 0.23609375, "eval_ag_news_bleu_score": 6.223502563873451, "eval_ag_news_bleu_score_sem": 0.4532127005588969, "eval_ag_news_emb_cos_sim": 0.6984884738922119, "eval_ag_news_emb_cos_sim_sem": 0.014999207109212875, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7820069789886475, "eval_ag_news_n_ngrams_match_1": 12.578125, "eval_ag_news_n_ngrams_match_2": 3.3046875, "eval_ag_news_n_ngrams_match_3": 1.1640625, "eval_ag_news_num_pred_words": 43.5390625, "eval_ag_news_num_true_words": 43.328125, "eval_ag_news_perplexity": 16.15140396953988, "eval_ag_news_pred_num_tokens": 69.5, "eval_ag_news_rouge_score": 0.278974484971796, "eval_ag_news_runtime": 42.6019, "eval_ag_news_samples_per_second": 11.737, "eval_ag_news_steps_per_second": 0.023, "eval_ag_news_token_set_f1": 0.310614795833766, "eval_ag_news_token_set_f1_sem": 0.009261656636344647, "eval_ag_news_token_set_precision": 0.2874556847351621, "eval_ag_news_token_set_recall": 0.3468109818796285, "eval_ag_news_true_num_tokens": 59.609375, "step": 200000 }, { "epoch": 11.697958706205767, "eval_anthropic_toxic_prompts_accuracy": 0.10290625, "eval_anthropic_toxic_prompts_bleu_score": 40.273935619144886, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.4829804463894782, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.872675895690918, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010661820881068707, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1015625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02680456515850638, "eval_anthropic_toxic_prompts_loss": 1.3661582469940186, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.0546875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.5390625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.6796875, "eval_anthropic_toxic_prompts_num_pred_words": 14.71875, "eval_anthropic_toxic_prompts_num_true_words": 14.875, "eval_anthropic_toxic_prompts_perplexity": 3.920261054003371, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.78125, "eval_anthropic_toxic_prompts_rouge_score": 0.657085778867142, "eval_anthropic_toxic_prompts_runtime": 29.5438, "eval_anthropic_toxic_prompts_samples_per_second": 16.924, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.678342927576033, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01839547386923549, "eval_anthropic_toxic_prompts_token_set_precision": 0.6732127856346126, "eval_anthropic_toxic_prompts_token_set_recall": 0.6899566963265478, "eval_anthropic_toxic_prompts_true_num_tokens": 18.4140625, "step": 200000 }, { "epoch": 11.697958706205767, "eval_arxiv_accuracy": 0.372671875, "eval_arxiv_bleu_score": 1.2028527280191317, "eval_arxiv_bleu_score_sem": 0.11924049832109503, "eval_arxiv_emb_cos_sim": 0.42587408423423767, "eval_arxiv_emb_cos_sim_sem": 0.017607398331165314, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4792211055755615, "eval_arxiv_n_ngrams_match_1": 11.3359375, "eval_arxiv_n_ngrams_match_2": 1.609375, "eval_arxiv_n_ngrams_match_3": 0.1875, "eval_arxiv_num_pred_words": 49.359375, "eval_arxiv_num_true_words": 87.640625, "eval_arxiv_perplexity": 32.43444922303986, "eval_arxiv_pred_num_tokens": 125.5546875, "eval_arxiv_rouge_score": 0.15614080896829535, "eval_arxiv_runtime": 31.9555, "eval_arxiv_samples_per_second": 15.647, "eval_arxiv_steps_per_second": 0.031, "eval_arxiv_token_set_f1": 0.1463517542545519, "eval_arxiv_token_set_f1_sem": 0.007950761284575416, "eval_arxiv_token_set_precision": 0.0961768039496215, "eval_arxiv_token_set_recall": 0.36561227959108106, "eval_arxiv_true_num_tokens": 125.8984375, "step": 200000 }, { "epoch": 11.697958706205767, "eval_python_code_alpaca_accuracy": 0.1234375, "eval_python_code_alpaca_bleu_score": 22.615230594519183, "eval_python_code_alpaca_bleu_score_sem": 1.3142410862635496, "eval_python_code_alpaca_emb_cos_sim": 0.8442756533622742, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009130834601819515, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.5745282173156738, "eval_python_code_alpaca_n_ngrams_match_1": 9.765625, "eval_python_code_alpaca_n_ngrams_match_2": 4.9921875, "eval_python_code_alpaca_n_ngrams_match_3": 2.6953125, "eval_python_code_alpaca_num_pred_words": 17.6015625, "eval_python_code_alpaca_num_true_words": 19.2109375, "eval_python_code_alpaca_perplexity": 4.828463095388446, "eval_python_code_alpaca_pred_num_tokens": 23.9765625, "eval_python_code_alpaca_rouge_score": 0.537223954082589, "eval_python_code_alpaca_runtime": 30.4857, "eval_python_code_alpaca_samples_per_second": 16.401, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.5620510073285079, "eval_python_code_alpaca_token_set_f1_sem": 0.013101256343594799, "eval_python_code_alpaca_token_set_precision": 0.540218114543415, "eval_python_code_alpaca_token_set_recall": 0.5938240968942289, "eval_python_code_alpaca_true_num_tokens": 24.640625, "step": 200000 }, { "epoch": 11.697958706205767, "eval_wikibio_accuracy": 0.3541875, "eval_wikibio_bleu_score": 6.628109065934494, "eval_wikibio_bleu_score_sem": 0.6702766649152152, "eval_wikibio_emb_cos_sim": 0.5362804532051086, "eval_wikibio_emb_cos_sim_sem": 0.025255369022488594, "eval_wikibio_emb_top1_equal": 0.8515625, "eval_wikibio_emb_top1_equal_sem": 0.03154846653342247, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7999677658081055, "eval_wikibio_n_ngrams_match_1": 14.0546875, "eval_wikibio_n_ngrams_match_2": 4.5390625, "eval_wikibio_n_ngrams_match_3": 1.90625, "eval_wikibio_num_pred_words": 50.484375, "eval_wikibio_num_true_words": 51.28125, "eval_wikibio_perplexity": 16.44411669974065, "eval_wikibio_pred_num_tokens": 105.921875, "eval_wikibio_rouge_score": 0.27027667342424366, "eval_wikibio_runtime": 32.2217, "eval_wikibio_samples_per_second": 15.518, "eval_wikibio_steps_per_second": 0.031, "eval_wikibio_token_set_f1": 0.28461674864586156, "eval_wikibio_token_set_f1_sem": 0.014435223560180392, "eval_wikibio_token_set_precision": 0.25200039653031453, "eval_wikibio_token_set_recall": 0.38990794942283713, "eval_wikibio_true_num_tokens": 97.8515625, "step": 200000 }, { "epoch": 11.697958706205767, "eval_msmarco_accuracy": 0.378140625, "eval_msmarco_bleu_score": 14.0509514431435, "eval_msmarco_bleu_score_sem": 1.1600630112425885, "eval_msmarco_emb_cos_sim": 0.7497991323471069, "eval_msmarco_emb_cos_sim_sem": 0.017147626727819443, "eval_msmarco_emb_top1_equal": 0.9453125, "eval_msmarco_emb_top1_equal_sem": 0.020175758749246597, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8621516227722168, "eval_msmarco_n_ngrams_match_1": 27.234375, "eval_msmarco_n_ngrams_match_2": 11.53125, "eval_msmarco_n_ngrams_match_3": 6.140625, "eval_msmarco_num_pred_words": 61.953125, "eval_msmarco_num_true_words": 62.3203125, "eval_msmarco_perplexity": 6.437573109701602, "eval_msmarco_pred_num_tokens": 86.1328125, "eval_msmarco_rouge_score": 0.4102348703886356, "eval_msmarco_runtime": 29.2662, "eval_msmarco_samples_per_second": 17.085, "eval_msmarco_steps_per_second": 0.034, "eval_msmarco_token_set_f1": 0.4396028109572814, "eval_msmarco_token_set_f1_sem": 0.0136303053344236, "eval_msmarco_token_set_precision": 0.4000530499470596, "eval_msmarco_token_set_recall": 0.5036857197158381, "eval_msmarco_true_num_tokens": 80.296875, "step": 200000 }, { "epoch": 11.70380768555887, "grad_norm": 0.24904869496822357, "learning_rate": 0.001, "loss": 2.0049, "step": 200100 }, { "epoch": 11.709656664911973, "grad_norm": 0.2802499234676361, "learning_rate": 0.001, "loss": 2.0007, "step": 200200 }, { "epoch": 11.715505644265075, "grad_norm": 0.18793617188930511, "learning_rate": 0.001, "loss": 1.9986, "step": 200300 }, { "epoch": 11.72135462361818, "grad_norm": 0.24140073359012604, "learning_rate": 0.001, "loss": 2.002, "step": 200400 }, { "epoch": 11.727203602971281, "grad_norm": 0.15938468277454376, "learning_rate": 0.001, "loss": 2.0017, "step": 200500 }, { "epoch": 11.733052582324385, "grad_norm": 0.2689951956272125, "learning_rate": 0.001, "loss": 2.0039, "step": 200600 }, { "epoch": 11.738901561677487, "grad_norm": 0.26711276173591614, "learning_rate": 0.001, "loss": 2.004, "step": 200700 }, { "epoch": 11.74475054103059, "grad_norm": 0.24721994996070862, "learning_rate": 0.001, "loss": 1.9983, "step": 200800 }, { "epoch": 11.750599520383693, "grad_norm": 0.18210195004940033, "learning_rate": 0.001, "loss": 2.0036, "step": 200900 }, { "epoch": 11.756448499736797, "grad_norm": 0.23689894378185272, "learning_rate": 0.001, "loss": 2.0076, "step": 201000 }, { "epoch": 11.762297479089899, "grad_norm": 0.2419663965702057, "learning_rate": 0.001, "loss": 2.0086, "step": 201100 }, { "epoch": 11.768146458443002, "grad_norm": 0.2624891698360443, "learning_rate": 0.001, "loss": 2.0073, "step": 201200 }, { "epoch": 11.773995437796104, "grad_norm": 0.18944473564624786, "learning_rate": 0.001, "loss": 1.9982, "step": 201300 }, { "epoch": 11.779844417149208, "grad_norm": 0.2507529556751251, "learning_rate": 0.001, "loss": 2.004, "step": 201400 }, { "epoch": 11.78569339650231, "grad_norm": 0.24678467214107513, "learning_rate": 0.001, "loss": 1.9972, "step": 201500 }, { "epoch": 11.791542375855414, "grad_norm": 0.26755234599113464, "learning_rate": 0.001, "loss": 1.9983, "step": 201600 }, { "epoch": 11.797391355208516, "grad_norm": 0.22987939417362213, "learning_rate": 0.001, "loss": 2.0014, "step": 201700 }, { "epoch": 11.803240334561618, "grad_norm": 0.24088235199451447, "learning_rate": 0.001, "loss": 2.0042, "step": 201800 }, { "epoch": 11.809089313914722, "grad_norm": 0.194310262799263, "learning_rate": 0.001, "loss": 2.0065, "step": 201900 }, { "epoch": 11.814938293267824, "grad_norm": 0.241758331656456, "learning_rate": 0.001, "loss": 2.0056, "step": 202000 }, { "epoch": 11.820787272620928, "grad_norm": 0.2521415650844574, "learning_rate": 0.001, "loss": 1.9974, "step": 202100 }, { "epoch": 11.82663625197403, "grad_norm": 0.2574783265590668, "learning_rate": 0.001, "loss": 2.0062, "step": 202200 }, { "epoch": 11.832485231327134, "grad_norm": 0.2307811677455902, "learning_rate": 0.001, "loss": 2.006, "step": 202300 }, { "epoch": 11.838334210680236, "grad_norm": 0.20845665037631989, "learning_rate": 0.001, "loss": 2.0023, "step": 202400 }, { "epoch": 11.84418319003334, "grad_norm": 0.22249333560466766, "learning_rate": 0.001, "loss": 2.0057, "step": 202500 }, { "epoch": 11.850032169386441, "grad_norm": 0.20681613683700562, "learning_rate": 0.001, "loss": 2.0081, "step": 202600 }, { "epoch": 11.855881148739545, "grad_norm": 0.29444485902786255, "learning_rate": 0.001, "loss": 2.0047, "step": 202700 }, { "epoch": 11.861730128092647, "grad_norm": 0.1991613209247589, "learning_rate": 0.001, "loss": 1.9973, "step": 202800 }, { "epoch": 11.867579107445751, "grad_norm": 0.26996177434921265, "learning_rate": 0.001, "loss": 2.0105, "step": 202900 }, { "epoch": 11.873428086798853, "grad_norm": 0.20059554278850555, "learning_rate": 0.001, "loss": 1.9985, "step": 203000 }, { "epoch": 11.879277066151957, "grad_norm": 0.19560621678829193, "learning_rate": 0.001, "loss": 2.0011, "step": 203100 }, { "epoch": 11.885126045505059, "grad_norm": 0.2511279881000519, "learning_rate": 0.001, "loss": 2.0009, "step": 203200 }, { "epoch": 11.890975024858163, "grad_norm": 0.2607780992984772, "learning_rate": 0.001, "loss": 2.0016, "step": 203300 }, { "epoch": 11.896824004211265, "grad_norm": 0.20962415635585785, "learning_rate": 0.001, "loss": 2.0022, "step": 203400 }, { "epoch": 11.902672983564369, "grad_norm": 0.2351568639278412, "learning_rate": 0.001, "loss": 2.0044, "step": 203500 }, { "epoch": 11.90852196291747, "grad_norm": 0.21231655776500702, "learning_rate": 0.001, "loss": 2.0012, "step": 203600 }, { "epoch": 11.914370942270574, "grad_norm": 0.2125813364982605, "learning_rate": 0.001, "loss": 2.0075, "step": 203700 }, { "epoch": 11.920219921623676, "grad_norm": 0.18582268059253693, "learning_rate": 0.001, "loss": 2.0016, "step": 203800 }, { "epoch": 11.92606890097678, "grad_norm": 0.36274757981300354, "learning_rate": 0.001, "loss": 2.0045, "step": 203900 }, { "epoch": 11.931917880329882, "grad_norm": 0.2602193057537079, "learning_rate": 0.001, "loss": 2.0066, "step": 204000 }, { "epoch": 11.937766859682986, "grad_norm": 0.2528907060623169, "learning_rate": 0.001, "loss": 2.0012, "step": 204100 }, { "epoch": 11.943615839036088, "grad_norm": 0.249723419547081, "learning_rate": 0.001, "loss": 2.0099, "step": 204200 }, { "epoch": 11.949464818389192, "grad_norm": 0.21000756323337555, "learning_rate": 0.001, "loss": 2.003, "step": 204300 }, { "epoch": 11.955313797742294, "grad_norm": 0.22554467618465424, "learning_rate": 0.001, "loss": 2.0016, "step": 204400 }, { "epoch": 11.961162777095398, "grad_norm": 0.31926417350769043, "learning_rate": 0.001, "loss": 2.0038, "step": 204500 }, { "epoch": 11.9670117564485, "grad_norm": 0.2609155476093292, "learning_rate": 0.001, "loss": 1.9989, "step": 204600 }, { "epoch": 11.972860735801603, "grad_norm": 0.3196989595890045, "learning_rate": 0.001, "loss": 2.0093, "step": 204700 }, { "epoch": 11.978709715154706, "grad_norm": 0.24776658415794373, "learning_rate": 0.001, "loss": 2.0043, "step": 204800 }, { "epoch": 11.984558694507808, "grad_norm": 0.14726880192756653, "learning_rate": 0.001, "loss": 2.0011, "step": 204900 }, { "epoch": 11.990407673860911, "grad_norm": 0.2722688615322113, "learning_rate": 0.001, "loss": 2.0017, "step": 205000 }, { "epoch": 11.996256653214013, "grad_norm": 0.2032519429922104, "learning_rate": 0.001, "loss": 1.9986, "step": 205100 }, { "epoch": 12.002105632567117, "grad_norm": 0.2962583899497986, "learning_rate": 0.001, "loss": 1.9934, "step": 205200 }, { "epoch": 12.00795461192022, "grad_norm": 0.26926642656326294, "learning_rate": 0.001, "loss": 1.9858, "step": 205300 }, { "epoch": 12.013803591273323, "grad_norm": 0.22411508858203888, "learning_rate": 0.001, "loss": 1.9829, "step": 205400 }, { "epoch": 12.019652570626425, "grad_norm": 0.23125705122947693, "learning_rate": 0.001, "loss": 1.978, "step": 205500 }, { "epoch": 12.025501549979529, "grad_norm": 0.2443566918373108, "learning_rate": 0.001, "loss": 1.9809, "step": 205600 }, { "epoch": 12.03135052933263, "grad_norm": 0.2558284103870392, "learning_rate": 0.001, "loss": 1.9857, "step": 205700 }, { "epoch": 12.037199508685735, "grad_norm": 0.31702834367752075, "learning_rate": 0.001, "loss": 1.9858, "step": 205800 }, { "epoch": 12.043048488038837, "grad_norm": 0.32670629024505615, "learning_rate": 0.001, "loss": 1.9875, "step": 205900 }, { "epoch": 12.04889746739194, "grad_norm": 0.24383263289928436, "learning_rate": 0.001, "loss": 1.9898, "step": 206000 }, { "epoch": 12.054746446745042, "grad_norm": 0.26001864671707153, "learning_rate": 0.001, "loss": 1.9829, "step": 206100 }, { "epoch": 12.060595426098146, "grad_norm": 0.26366937160491943, "learning_rate": 0.001, "loss": 1.9893, "step": 206200 }, { "epoch": 12.066444405451248, "grad_norm": 0.20987975597381592, "learning_rate": 0.001, "loss": 1.9861, "step": 206300 }, { "epoch": 12.072293384804352, "grad_norm": 0.28744930028915405, "learning_rate": 0.001, "loss": 1.9963, "step": 206400 }, { "epoch": 12.078142364157454, "grad_norm": 0.2699495553970337, "learning_rate": 0.001, "loss": 1.9906, "step": 206500 }, { "epoch": 12.083991343510558, "grad_norm": 0.2067594975233078, "learning_rate": 0.001, "loss": 1.9821, "step": 206600 }, { "epoch": 12.08984032286366, "grad_norm": 0.2402385175228119, "learning_rate": 0.001, "loss": 1.9853, "step": 206700 }, { "epoch": 12.095689302216764, "grad_norm": 0.25266438722610474, "learning_rate": 0.001, "loss": 1.9883, "step": 206800 }, { "epoch": 12.101538281569866, "grad_norm": 0.30691173672676086, "learning_rate": 0.001, "loss": 1.9874, "step": 206900 }, { "epoch": 12.10738726092297, "grad_norm": 0.26781442761421204, "learning_rate": 0.001, "loss": 1.9932, "step": 207000 }, { "epoch": 12.113236240276072, "grad_norm": 0.218129962682724, "learning_rate": 0.001, "loss": 1.9845, "step": 207100 }, { "epoch": 12.119085219629175, "grad_norm": 0.29707178473472595, "learning_rate": 0.001, "loss": 1.9903, "step": 207200 }, { "epoch": 12.124934198982277, "grad_norm": 0.2550409734249115, "learning_rate": 0.001, "loss": 1.9891, "step": 207300 }, { "epoch": 12.130783178335381, "grad_norm": 0.24537059664726257, "learning_rate": 0.001, "loss": 1.9859, "step": 207400 }, { "epoch": 12.136632157688483, "grad_norm": 0.27574947476387024, "learning_rate": 0.001, "loss": 1.9874, "step": 207500 }, { "epoch": 12.142481137041587, "grad_norm": 0.21475866436958313, "learning_rate": 0.001, "loss": 1.9935, "step": 207600 }, { "epoch": 12.148330116394689, "grad_norm": 0.25040602684020996, "learning_rate": 0.001, "loss": 1.9839, "step": 207700 }, { "epoch": 12.154179095747793, "grad_norm": 0.20602312684059143, "learning_rate": 0.001, "loss": 1.988, "step": 207800 }, { "epoch": 12.160028075100895, "grad_norm": 0.2398747205734253, "learning_rate": 0.001, "loss": 1.9924, "step": 207900 }, { "epoch": 12.165877054453997, "grad_norm": 0.24003161489963531, "learning_rate": 0.001, "loss": 1.9926, "step": 208000 }, { "epoch": 12.1717260338071, "grad_norm": 0.2282644361257553, "learning_rate": 0.001, "loss": 1.9917, "step": 208100 }, { "epoch": 12.177575013160203, "grad_norm": 0.2105446308851242, "learning_rate": 0.001, "loss": 1.9853, "step": 208200 }, { "epoch": 12.183423992513307, "grad_norm": 0.3018331229686737, "learning_rate": 0.001, "loss": 1.9898, "step": 208300 }, { "epoch": 12.189272971866409, "grad_norm": 0.35882529616355896, "learning_rate": 0.001, "loss": 1.9933, "step": 208400 }, { "epoch": 12.195121951219512, "grad_norm": 0.22786220908164978, "learning_rate": 0.001, "loss": 1.9848, "step": 208500 }, { "epoch": 12.200970930572614, "grad_norm": 0.2393561452627182, "learning_rate": 0.001, "loss": 1.9859, "step": 208600 }, { "epoch": 12.206819909925718, "grad_norm": 0.2952001690864563, "learning_rate": 0.001, "loss": 1.9943, "step": 208700 }, { "epoch": 12.21266888927882, "grad_norm": 0.22450965642929077, "learning_rate": 0.001, "loss": 1.9995, "step": 208800 }, { "epoch": 12.218517868631924, "grad_norm": 0.2549421191215515, "learning_rate": 0.001, "loss": 1.9852, "step": 208900 }, { "epoch": 12.224366847985026, "grad_norm": 0.24438557028770447, "learning_rate": 0.001, "loss": 1.9965, "step": 209000 }, { "epoch": 12.23021582733813, "grad_norm": 0.21665479242801666, "learning_rate": 0.001, "loss": 1.9865, "step": 209100 }, { "epoch": 12.236064806691232, "grad_norm": 0.2841055989265442, "learning_rate": 0.001, "loss": 1.9865, "step": 209200 }, { "epoch": 12.241913786044336, "grad_norm": 0.23206572234630585, "learning_rate": 0.001, "loss": 1.992, "step": 209300 }, { "epoch": 12.247762765397438, "grad_norm": 0.29309725761413574, "learning_rate": 0.001, "loss": 1.9924, "step": 209400 }, { "epoch": 12.253611744750541, "grad_norm": 0.26699554920196533, "learning_rate": 0.001, "loss": 1.9897, "step": 209500 }, { "epoch": 12.259460724103644, "grad_norm": 0.22153447568416595, "learning_rate": 0.001, "loss": 1.9853, "step": 209600 }, { "epoch": 12.265309703456747, "grad_norm": 0.2783917188644409, "learning_rate": 0.001, "loss": 1.9952, "step": 209700 }, { "epoch": 12.27115868280985, "grad_norm": 0.2332148402929306, "learning_rate": 0.001, "loss": 1.991, "step": 209800 }, { "epoch": 12.277007662162953, "grad_norm": 0.25457561016082764, "learning_rate": 0.001, "loss": 1.9877, "step": 209900 }, { "epoch": 12.282856641516055, "grad_norm": 0.22801639139652252, "learning_rate": 0.001, "loss": 1.9948, "step": 210000 }, { "epoch": 12.288705620869159, "grad_norm": 0.32516172528266907, "learning_rate": 0.001, "loss": 1.9954, "step": 210100 }, { "epoch": 12.294554600222261, "grad_norm": 0.3649633228778839, "learning_rate": 0.001, "loss": 1.9881, "step": 210200 }, { "epoch": 12.300403579575365, "grad_norm": 0.2303839772939682, "learning_rate": 0.001, "loss": 1.9856, "step": 210300 }, { "epoch": 12.306252558928467, "grad_norm": 0.2549816370010376, "learning_rate": 0.001, "loss": 1.9912, "step": 210400 }, { "epoch": 12.31210153828157, "grad_norm": 0.32278990745544434, "learning_rate": 0.001, "loss": 1.9911, "step": 210500 }, { "epoch": 12.317950517634673, "grad_norm": 0.27344274520874023, "learning_rate": 0.001, "loss": 1.99, "step": 210600 }, { "epoch": 12.323799496987776, "grad_norm": 0.29085177183151245, "learning_rate": 0.001, "loss": 1.9901, "step": 210700 }, { "epoch": 12.329648476340878, "grad_norm": 0.22320674359798431, "learning_rate": 0.001, "loss": 1.9957, "step": 210800 }, { "epoch": 12.335497455693982, "grad_norm": 0.22829949855804443, "learning_rate": 0.001, "loss": 1.9872, "step": 210900 }, { "epoch": 12.341346435047084, "grad_norm": 0.31023287773132324, "learning_rate": 0.001, "loss": 1.993, "step": 211000 }, { "epoch": 12.347195414400186, "grad_norm": 0.25514641404151917, "learning_rate": 0.001, "loss": 1.9916, "step": 211100 }, { "epoch": 12.35304439375329, "grad_norm": 0.259879469871521, "learning_rate": 0.001, "loss": 1.9886, "step": 211200 }, { "epoch": 12.358893373106392, "grad_norm": 0.25703856348991394, "learning_rate": 0.001, "loss": 1.9932, "step": 211300 }, { "epoch": 12.364742352459496, "grad_norm": 0.3231908082962036, "learning_rate": 0.001, "loss": 1.993, "step": 211400 }, { "epoch": 12.370591331812598, "grad_norm": 0.2986411154270172, "learning_rate": 0.001, "loss": 1.9888, "step": 211500 }, { "epoch": 12.376440311165702, "grad_norm": 0.2803761661052704, "learning_rate": 0.001, "loss": 1.9949, "step": 211600 }, { "epoch": 12.382289290518804, "grad_norm": 0.19850309193134308, "learning_rate": 0.001, "loss": 1.9972, "step": 211700 }, { "epoch": 12.388138269871908, "grad_norm": 0.2987041771411896, "learning_rate": 0.001, "loss": 1.9848, "step": 211800 }, { "epoch": 12.39398724922501, "grad_norm": 0.23376832902431488, "learning_rate": 0.001, "loss": 1.9901, "step": 211900 }, { "epoch": 12.399836228578113, "grad_norm": 0.2801917791366577, "learning_rate": 0.001, "loss": 1.987, "step": 212000 }, { "epoch": 12.405685207931215, "grad_norm": 0.24053789675235748, "learning_rate": 0.001, "loss": 1.9984, "step": 212100 }, { "epoch": 12.41153418728432, "grad_norm": 0.20089595019817352, "learning_rate": 0.001, "loss": 1.9837, "step": 212200 }, { "epoch": 12.417383166637421, "grad_norm": 0.20909340679645538, "learning_rate": 0.001, "loss": 1.988, "step": 212300 }, { "epoch": 12.423232145990525, "grad_norm": 0.25517866015434265, "learning_rate": 0.001, "loss": 1.99, "step": 212400 }, { "epoch": 12.429081125343627, "grad_norm": 0.21270594000816345, "learning_rate": 0.001, "loss": 1.99, "step": 212500 }, { "epoch": 12.43493010469673, "grad_norm": 0.22485698759555817, "learning_rate": 0.001, "loss": 1.9893, "step": 212600 }, { "epoch": 12.440779084049833, "grad_norm": 0.2598619759082794, "learning_rate": 0.001, "loss": 1.9907, "step": 212700 }, { "epoch": 12.446628063402937, "grad_norm": 0.2912479341030121, "learning_rate": 0.001, "loss": 1.9943, "step": 212800 }, { "epoch": 12.452477042756039, "grad_norm": 0.21971791982650757, "learning_rate": 0.001, "loss": 1.9945, "step": 212900 }, { "epoch": 12.458326022109143, "grad_norm": 0.24492166936397552, "learning_rate": 0.001, "loss": 1.9995, "step": 213000 }, { "epoch": 12.464175001462245, "grad_norm": 0.2565488815307617, "learning_rate": 0.001, "loss": 1.9885, "step": 213100 }, { "epoch": 12.470023980815348, "grad_norm": 0.29776903986930847, "learning_rate": 0.001, "loss": 1.9998, "step": 213200 }, { "epoch": 12.47587296016845, "grad_norm": 0.2642241418361664, "learning_rate": 0.001, "loss": 1.9939, "step": 213300 }, { "epoch": 12.481721939521554, "grad_norm": 0.2514534890651703, "learning_rate": 0.001, "loss": 1.9867, "step": 213400 }, { "epoch": 12.487570918874656, "grad_norm": 0.26708242297172546, "learning_rate": 0.001, "loss": 1.9913, "step": 213500 }, { "epoch": 12.49341989822776, "grad_norm": 0.31386813521385193, "learning_rate": 0.001, "loss": 1.9952, "step": 213600 }, { "epoch": 12.499268877580862, "grad_norm": 0.23421123623847961, "learning_rate": 0.001, "loss": 1.9892, "step": 213700 }, { "epoch": 12.505117856933966, "grad_norm": 0.22968915104866028, "learning_rate": 0.001, "loss": 1.9912, "step": 213800 }, { "epoch": 12.510966836287068, "grad_norm": 0.252972275018692, "learning_rate": 0.001, "loss": 1.9995, "step": 213900 }, { "epoch": 12.516815815640172, "grad_norm": 0.2670825719833374, "learning_rate": 0.001, "loss": 1.9931, "step": 214000 }, { "epoch": 12.522664794993274, "grad_norm": 0.21752998232841492, "learning_rate": 0.001, "loss": 1.9859, "step": 214100 }, { "epoch": 12.528513774346376, "grad_norm": 0.20093129575252533, "learning_rate": 0.001, "loss": 1.9881, "step": 214200 }, { "epoch": 12.53436275369948, "grad_norm": 0.260749489068985, "learning_rate": 0.001, "loss": 1.9897, "step": 214300 }, { "epoch": 12.540211733052582, "grad_norm": 0.21012678742408752, "learning_rate": 0.001, "loss": 1.9922, "step": 214400 }, { "epoch": 12.546060712405685, "grad_norm": 0.275325745344162, "learning_rate": 0.001, "loss": 1.9971, "step": 214500 }, { "epoch": 12.551909691758787, "grad_norm": 0.2167966514825821, "learning_rate": 0.001, "loss": 1.9886, "step": 214600 }, { "epoch": 12.557758671111891, "grad_norm": 0.23587220907211304, "learning_rate": 0.001, "loss": 1.9991, "step": 214700 }, { "epoch": 12.563607650464993, "grad_norm": 0.24020154774188995, "learning_rate": 0.001, "loss": 1.9909, "step": 214800 }, { "epoch": 12.569456629818097, "grad_norm": 0.26482248306274414, "learning_rate": 0.001, "loss": 1.9912, "step": 214900 }, { "epoch": 12.575305609171199, "grad_norm": 0.2901129424571991, "learning_rate": 0.001, "loss": 1.9958, "step": 215000 }, { "epoch": 12.581154588524303, "grad_norm": 0.24587619304656982, "learning_rate": 0.001, "loss": 1.9926, "step": 215100 }, { "epoch": 12.587003567877405, "grad_norm": 0.25166720151901245, "learning_rate": 0.001, "loss": 1.9881, "step": 215200 }, { "epoch": 12.592852547230509, "grad_norm": 0.4902677536010742, "learning_rate": 0.001, "loss": 1.9963, "step": 215300 }, { "epoch": 12.59870152658361, "grad_norm": 0.23948544263839722, "learning_rate": 0.001, "loss": 1.9879, "step": 215400 }, { "epoch": 12.604550505936714, "grad_norm": 0.3060111701488495, "learning_rate": 0.001, "loss": 1.9952, "step": 215500 }, { "epoch": 12.610399485289816, "grad_norm": 0.22864434123039246, "learning_rate": 0.001, "loss": 1.9945, "step": 215600 }, { "epoch": 12.61624846464292, "grad_norm": 0.37697935104370117, "learning_rate": 0.001, "loss": 1.9889, "step": 215700 }, { "epoch": 12.622097443996022, "grad_norm": 0.2860703766345978, "learning_rate": 0.001, "loss": 1.9983, "step": 215800 }, { "epoch": 12.627946423349126, "grad_norm": 0.24686746299266815, "learning_rate": 0.001, "loss": 1.9923, "step": 215900 }, { "epoch": 12.633795402702228, "grad_norm": 0.24406294524669647, "learning_rate": 0.001, "loss": 1.9873, "step": 216000 }, { "epoch": 12.639644382055332, "grad_norm": 0.25791046023368835, "learning_rate": 0.001, "loss": 1.99, "step": 216100 }, { "epoch": 12.645493361408434, "grad_norm": 0.24631169438362122, "learning_rate": 0.001, "loss": 1.9916, "step": 216200 }, { "epoch": 12.651342340761538, "grad_norm": 0.2228984534740448, "learning_rate": 0.001, "loss": 1.9888, "step": 216300 }, { "epoch": 12.65719132011464, "grad_norm": 0.2650911509990692, "learning_rate": 0.001, "loss": 1.9968, "step": 216400 }, { "epoch": 12.663040299467744, "grad_norm": 0.2878989279270172, "learning_rate": 0.001, "loss": 1.9946, "step": 216500 }, { "epoch": 12.668889278820846, "grad_norm": 0.26413217186927795, "learning_rate": 0.001, "loss": 1.9938, "step": 216600 }, { "epoch": 12.67473825817395, "grad_norm": 0.22635801136493683, "learning_rate": 0.001, "loss": 1.9906, "step": 216700 }, { "epoch": 12.680587237527051, "grad_norm": 0.1767072230577469, "learning_rate": 0.001, "loss": 1.9798, "step": 216800 }, { "epoch": 12.686436216880155, "grad_norm": 0.21948282420635223, "learning_rate": 0.001, "loss": 1.9903, "step": 216900 }, { "epoch": 12.692285196233257, "grad_norm": 0.25149187445640564, "learning_rate": 0.001, "loss": 1.9854, "step": 217000 }, { "epoch": 12.698134175586361, "grad_norm": 0.418552041053772, "learning_rate": 0.001, "loss": 1.9892, "step": 217100 }, { "epoch": 12.703983154939463, "grad_norm": 0.2702252268791199, "learning_rate": 0.001, "loss": 1.9919, "step": 217200 }, { "epoch": 12.709832134292565, "grad_norm": 0.2981821298599243, "learning_rate": 0.001, "loss": 1.9895, "step": 217300 }, { "epoch": 12.715681113645669, "grad_norm": 0.26637002825737, "learning_rate": 0.001, "loss": 1.9907, "step": 217400 }, { "epoch": 12.721530092998771, "grad_norm": 0.28115782141685486, "learning_rate": 0.001, "loss": 1.9978, "step": 217500 }, { "epoch": 12.727379072351875, "grad_norm": 0.36924585700035095, "learning_rate": 0.001, "loss": 1.9933, "step": 217600 }, { "epoch": 12.733228051704977, "grad_norm": 0.2673327326774597, "learning_rate": 0.001, "loss": 1.995, "step": 217700 }, { "epoch": 12.73907703105808, "grad_norm": 0.29030096530914307, "learning_rate": 0.001, "loss": 1.9947, "step": 217800 }, { "epoch": 12.744926010411183, "grad_norm": 0.19573605060577393, "learning_rate": 0.001, "loss": 1.9959, "step": 217900 }, { "epoch": 12.750774989764286, "grad_norm": 0.21373245120048523, "learning_rate": 0.001, "loss": 1.9833, "step": 218000 }, { "epoch": 12.756623969117388, "grad_norm": 0.22951368987560272, "learning_rate": 0.001, "loss": 1.9947, "step": 218100 }, { "epoch": 12.762472948470492, "grad_norm": 0.24913440644741058, "learning_rate": 0.001, "loss": 1.9849, "step": 218200 }, { "epoch": 12.768321927823594, "grad_norm": 0.23290890455245972, "learning_rate": 0.001, "loss": 1.9903, "step": 218300 }, { "epoch": 12.774170907176698, "grad_norm": 0.2959122359752655, "learning_rate": 0.001, "loss": 1.9901, "step": 218400 }, { "epoch": 12.7800198865298, "grad_norm": 0.24441851675510406, "learning_rate": 0.001, "loss": 1.9853, "step": 218500 }, { "epoch": 12.785868865882904, "grad_norm": 0.1715906709432602, "learning_rate": 0.001, "loss": 1.9893, "step": 218600 }, { "epoch": 12.791717845236006, "grad_norm": 0.21632139384746552, "learning_rate": 0.001, "loss": 1.9933, "step": 218700 }, { "epoch": 12.79756682458911, "grad_norm": 0.2288036048412323, "learning_rate": 0.001, "loss": 1.9924, "step": 218800 }, { "epoch": 12.803415803942212, "grad_norm": 0.2464945912361145, "learning_rate": 0.001, "loss": 1.9891, "step": 218900 }, { "epoch": 12.809264783295315, "grad_norm": 0.26075947284698486, "learning_rate": 0.001, "loss": 1.9876, "step": 219000 }, { "epoch": 12.815113762648418, "grad_norm": 0.29697543382644653, "learning_rate": 0.001, "loss": 1.9959, "step": 219100 }, { "epoch": 12.820962742001521, "grad_norm": 0.374957412481308, "learning_rate": 0.001, "loss": 2.0011, "step": 219200 }, { "epoch": 12.826811721354623, "grad_norm": 0.2598373293876648, "learning_rate": 0.001, "loss": 1.9959, "step": 219300 }, { "epoch": 12.832660700707727, "grad_norm": 0.21245189011096954, "learning_rate": 0.001, "loss": 1.9897, "step": 219400 }, { "epoch": 12.83850968006083, "grad_norm": 0.263327956199646, "learning_rate": 0.001, "loss": 1.9918, "step": 219500 }, { "epoch": 12.844358659413933, "grad_norm": 0.23482419550418854, "learning_rate": 0.001, "loss": 1.9961, "step": 219600 }, { "epoch": 12.850207638767035, "grad_norm": 0.27459055185317993, "learning_rate": 0.001, "loss": 1.9873, "step": 219700 }, { "epoch": 12.856056618120139, "grad_norm": 0.2805071771144867, "learning_rate": 0.001, "loss": 1.9948, "step": 219800 }, { "epoch": 12.86190559747324, "grad_norm": 0.28957125544548035, "learning_rate": 0.001, "loss": 1.9928, "step": 219900 }, { "epoch": 12.867754576826345, "grad_norm": 0.28324463963508606, "learning_rate": 0.001, "loss": 1.9821, "step": 220000 }, { "epoch": 12.873603556179447, "grad_norm": 0.24029484391212463, "learning_rate": 0.001, "loss": 1.9892, "step": 220100 }, { "epoch": 12.87945253553255, "grad_norm": 0.257506400346756, "learning_rate": 0.001, "loss": 1.9862, "step": 220200 }, { "epoch": 12.885301514885652, "grad_norm": 0.30591273307800293, "learning_rate": 0.001, "loss": 1.9882, "step": 220300 }, { "epoch": 12.891150494238754, "grad_norm": 0.2803141474723816, "learning_rate": 0.001, "loss": 1.991, "step": 220400 }, { "epoch": 12.896999473591858, "grad_norm": 0.34142813086509705, "learning_rate": 0.001, "loss": 1.9967, "step": 220500 }, { "epoch": 12.90284845294496, "grad_norm": 0.24441038072109222, "learning_rate": 0.001, "loss": 1.9932, "step": 220600 }, { "epoch": 12.908697432298064, "grad_norm": 0.2529493570327759, "learning_rate": 0.001, "loss": 1.9944, "step": 220700 }, { "epoch": 12.914546411651166, "grad_norm": 0.23719379305839539, "learning_rate": 0.001, "loss": 1.989, "step": 220800 }, { "epoch": 12.92039539100427, "grad_norm": 0.3116234242916107, "learning_rate": 0.001, "loss": 1.9938, "step": 220900 }, { "epoch": 12.926244370357372, "grad_norm": 0.23318657279014587, "learning_rate": 0.001, "loss": 1.9869, "step": 221000 }, { "epoch": 12.932093349710476, "grad_norm": 0.2602800726890564, "learning_rate": 0.001, "loss": 1.9881, "step": 221100 }, { "epoch": 12.937942329063578, "grad_norm": 0.2972777783870697, "learning_rate": 0.001, "loss": 1.9966, "step": 221200 }, { "epoch": 12.943791308416682, "grad_norm": 0.26664453744888306, "learning_rate": 0.001, "loss": 1.9892, "step": 221300 }, { "epoch": 12.949640287769784, "grad_norm": 0.19203750789165497, "learning_rate": 0.001, "loss": 1.9857, "step": 221400 }, { "epoch": 12.955489267122887, "grad_norm": 0.19561320543289185, "learning_rate": 0.001, "loss": 1.9891, "step": 221500 }, { "epoch": 12.96133824647599, "grad_norm": 0.2920706272125244, "learning_rate": 0.001, "loss": 1.9875, "step": 221600 }, { "epoch": 12.967187225829093, "grad_norm": 0.25829678773880005, "learning_rate": 0.001, "loss": 1.9895, "step": 221700 }, { "epoch": 12.973036205182195, "grad_norm": 0.30420997738838196, "learning_rate": 0.001, "loss": 1.9989, "step": 221800 }, { "epoch": 12.978885184535299, "grad_norm": 0.2595987021923065, "learning_rate": 0.001, "loss": 1.9914, "step": 221900 }, { "epoch": 12.984734163888401, "grad_norm": 0.25430557131767273, "learning_rate": 0.001, "loss": 1.9892, "step": 222000 }, { "epoch": 12.990583143241505, "grad_norm": 0.30784767866134644, "learning_rate": 0.001, "loss": 1.9859, "step": 222100 }, { "epoch": 12.996432122594607, "grad_norm": 0.2967911660671234, "learning_rate": 0.001, "loss": 1.9936, "step": 222200 }, { "epoch": 13.00228110194771, "grad_norm": 0.27160167694091797, "learning_rate": 0.001, "loss": 1.9814, "step": 222300 }, { "epoch": 13.008130081300813, "grad_norm": 0.22919349372386932, "learning_rate": 0.001, "loss": 1.9733, "step": 222400 }, { "epoch": 13.013979060653917, "grad_norm": 0.2664661705493927, "learning_rate": 0.001, "loss": 1.9776, "step": 222500 }, { "epoch": 13.019828040007019, "grad_norm": 0.2195388227701187, "learning_rate": 0.001, "loss": 1.9708, "step": 222600 }, { "epoch": 13.025677019360122, "grad_norm": 0.2528992295265198, "learning_rate": 0.001, "loss": 1.97, "step": 222700 }, { "epoch": 13.031525998713224, "grad_norm": 0.29100343585014343, "learning_rate": 0.001, "loss": 1.9782, "step": 222800 }, { "epoch": 13.037374978066328, "grad_norm": 0.17042969167232513, "learning_rate": 0.001, "loss": 1.9734, "step": 222900 }, { "epoch": 13.04322395741943, "grad_norm": 0.24760393798351288, "learning_rate": 0.001, "loss": 1.9752, "step": 223000 }, { "epoch": 13.049072936772534, "grad_norm": 0.22083652019500732, "learning_rate": 0.001, "loss": 1.9711, "step": 223100 }, { "epoch": 13.054921916125636, "grad_norm": 0.31539714336395264, "learning_rate": 0.001, "loss": 1.9761, "step": 223200 }, { "epoch": 13.06077089547874, "grad_norm": 0.3016059100627899, "learning_rate": 0.001, "loss": 1.9774, "step": 223300 }, { "epoch": 13.066619874831842, "grad_norm": 0.3155173361301422, "learning_rate": 0.001, "loss": 1.9788, "step": 223400 }, { "epoch": 13.072468854184944, "grad_norm": 0.2169780135154724, "learning_rate": 0.001, "loss": 1.9774, "step": 223500 }, { "epoch": 13.078317833538048, "grad_norm": 0.2470683753490448, "learning_rate": 0.001, "loss": 1.9729, "step": 223600 }, { "epoch": 13.08416681289115, "grad_norm": 0.22286275029182434, "learning_rate": 0.001, "loss": 1.9774, "step": 223700 }, { "epoch": 13.090015792244253, "grad_norm": 0.2392960786819458, "learning_rate": 0.001, "loss": 1.9694, "step": 223800 }, { "epoch": 13.095864771597356, "grad_norm": 0.20342795550823212, "learning_rate": 0.001, "loss": 1.9732, "step": 223900 }, { "epoch": 13.10171375095046, "grad_norm": 0.19710732996463776, "learning_rate": 0.001, "loss": 1.9789, "step": 224000 }, { "epoch": 13.107562730303561, "grad_norm": 0.2079150527715683, "learning_rate": 0.001, "loss": 1.9762, "step": 224100 }, { "epoch": 13.113411709656665, "grad_norm": 0.22876454889774323, "learning_rate": 0.001, "loss": 1.9748, "step": 224200 }, { "epoch": 13.119260689009767, "grad_norm": 0.24685467779636383, "learning_rate": 0.001, "loss": 1.9813, "step": 224300 }, { "epoch": 13.125109668362871, "grad_norm": 0.1787969470024109, "learning_rate": 0.001, "loss": 1.9769, "step": 224400 }, { "epoch": 13.130958647715973, "grad_norm": 0.19456087052822113, "learning_rate": 0.001, "loss": 1.978, "step": 224500 }, { "epoch": 13.136807627069077, "grad_norm": 0.16664652526378632, "learning_rate": 0.001, "loss": 1.9735, "step": 224600 }, { "epoch": 13.142656606422179, "grad_norm": 0.2061654031276703, "learning_rate": 0.001, "loss": 1.9786, "step": 224700 }, { "epoch": 13.148505585775283, "grad_norm": 0.22549092769622803, "learning_rate": 0.001, "loss": 1.9805, "step": 224800 }, { "epoch": 13.154354565128385, "grad_norm": 0.26403647661209106, "learning_rate": 0.001, "loss": 1.9735, "step": 224900 }, { "epoch": 13.160203544481488, "grad_norm": 0.21778357028961182, "learning_rate": 0.001, "loss": 1.9701, "step": 225000 }, { "epoch": 13.160203544481488, "eval_ag_news_accuracy": 0.236515625, "eval_ag_news_bleu_score": 6.17196921066404, "eval_ag_news_bleu_score_sem": 0.44963582885768816, "eval_ag_news_emb_cos_sim": 0.6869640350341797, "eval_ag_news_emb_cos_sim_sem": 0.015663418918848038, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.762662887573242, "eval_ag_news_n_ngrams_match_1": 12.828125, "eval_ag_news_n_ngrams_match_2": 3.640625, "eval_ag_news_n_ngrams_match_3": 1.3125, "eval_ag_news_num_pred_words": 45.7734375, "eval_ag_news_num_true_words": 45.4296875, "eval_ag_news_perplexity": 15.8419722215709, "eval_ag_news_pred_num_tokens": 72.890625, "eval_ag_news_rouge_score": 0.2688865314683392, "eval_ag_news_runtime": 39.5728, "eval_ag_news_samples_per_second": 12.635, "eval_ag_news_steps_per_second": 0.025, "eval_ag_news_token_set_f1": 0.3089889654554189, "eval_ag_news_token_set_f1_sem": 0.009965491224987938, "eval_ag_news_token_set_precision": 0.2884475834853711, "eval_ag_news_token_set_recall": 0.34672498958972797, "eval_ag_news_true_num_tokens": 63.8359375, "step": 225000 }, { "epoch": 13.160203544481488, "eval_anthropic_toxic_prompts_accuracy": 0.10065625, "eval_anthropic_toxic_prompts_bleu_score": 44.7046201332265, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.658882989807237, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8854397535324097, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.0107105178758502, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1484375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0315484639796987, "eval_anthropic_toxic_prompts_loss": 1.3306206464767456, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.796875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.6484375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.71875, "eval_anthropic_toxic_prompts_num_pred_words": 13.78125, "eval_anthropic_toxic_prompts_num_true_words": 13.71875, "eval_anthropic_toxic_prompts_perplexity": 3.7833908072092095, "eval_anthropic_toxic_prompts_pred_num_tokens": 17.4765625, "eval_anthropic_toxic_prompts_rouge_score": 0.6969926503152207, "eval_anthropic_toxic_prompts_runtime": 29.5103, "eval_anthropic_toxic_prompts_samples_per_second": 16.943, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.7127467135120026, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018154005044470862, "eval_anthropic_toxic_prompts_token_set_precision": 0.7133390167804808, "eval_anthropic_toxic_prompts_token_set_recall": 0.719809524029453, "eval_anthropic_toxic_prompts_true_num_tokens": 16.8515625, "step": 225000 }, { "epoch": 13.160203544481488, "eval_arxiv_accuracy": 0.374140625, "eval_arxiv_bleu_score": 1.564270921056867, "eval_arxiv_bleu_score_sem": 0.13731973237710332, "eval_arxiv_emb_cos_sim": 0.39945048093795776, "eval_arxiv_emb_cos_sim_sem": 0.017508739605545998, "eval_arxiv_emb_top1_equal": 0.8984375, "eval_arxiv_emb_top1_equal_sem": 0.026804566383361816, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.453461170196533, "eval_arxiv_n_ngrams_match_1": 12.0234375, "eval_arxiv_n_ngrams_match_2": 1.96875, "eval_arxiv_n_ngrams_match_3": 0.3671875, "eval_arxiv_num_pred_words": 53.3359375, "eval_arxiv_num_true_words": 85.921875, "eval_arxiv_perplexity": 31.60960942827327, "eval_arxiv_pred_num_tokens": 125.5703125, "eval_arxiv_rouge_score": 0.16208976661198993, "eval_arxiv_runtime": 31.6522, "eval_arxiv_samples_per_second": 15.797, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.16334777260529332, "eval_arxiv_token_set_f1_sem": 0.008145822097213042, "eval_arxiv_token_set_precision": 0.1088182194781907, "eval_arxiv_token_set_recall": 0.3993301172475302, "eval_arxiv_true_num_tokens": 125.5234375, "step": 225000 }, { "epoch": 13.160203544481488, "eval_python_code_alpaca_accuracy": 0.12721875, "eval_python_code_alpaca_bleu_score": 23.963756882357305, "eval_python_code_alpaca_bleu_score_sem": 1.5754538977990944, "eval_python_code_alpaca_emb_cos_sim": 0.8518620729446411, "eval_python_code_alpaca_emb_cos_sim_sem": 0.0100317457690835, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5371242761611938, "eval_python_code_alpaca_n_ngrams_match_1": 10.1875, "eval_python_code_alpaca_n_ngrams_match_2": 5.1953125, "eval_python_code_alpaca_n_ngrams_match_3": 2.7421875, "eval_python_code_alpaca_num_pred_words": 18.7890625, "eval_python_code_alpaca_num_true_words": 20.3359375, "eval_python_code_alpaca_perplexity": 4.651195466657661, "eval_python_code_alpaca_pred_num_tokens": 25.0078125, "eval_python_code_alpaca_rouge_score": 0.5481853347550603, "eval_python_code_alpaca_runtime": 30.1667, "eval_python_code_alpaca_samples_per_second": 16.575, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.5715759635366062, "eval_python_code_alpaca_token_set_f1_sem": 0.014280382877980325, "eval_python_code_alpaca_token_set_precision": 0.5562399682829827, "eval_python_code_alpaca_token_set_recall": 0.5971745231199707, "eval_python_code_alpaca_true_num_tokens": 25.7890625, "step": 225000 }, { "epoch": 13.160203544481488, "eval_wikibio_accuracy": 0.36125, "eval_wikibio_bleu_score": 6.999257018429898, "eval_wikibio_bleu_score_sem": 0.6818654790612579, "eval_wikibio_emb_cos_sim": 0.5741959810256958, "eval_wikibio_emb_cos_sim_sem": 0.022624924778938293, "eval_wikibio_emb_top1_equal": 0.921875, "eval_wikibio_emb_top1_equal_sem": 0.023813825100660324, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.774052143096924, "eval_wikibio_n_ngrams_match_1": 15.1328125, "eval_wikibio_n_ngrams_match_2": 4.8828125, "eval_wikibio_n_ngrams_match_3": 1.921875, "eval_wikibio_num_pred_words": 51.109375, "eval_wikibio_num_true_words": 52.9609375, "eval_wikibio_perplexity": 16.023431874879684, "eval_wikibio_pred_num_tokens": 103.0078125, "eval_wikibio_rouge_score": 0.30140186259223367, "eval_wikibio_runtime": 31.0162, "eval_wikibio_samples_per_second": 16.121, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32192907372960333, "eval_wikibio_token_set_f1_sem": 0.012590325907744932, "eval_wikibio_token_set_precision": 0.27988761402063556, "eval_wikibio_token_set_recall": 0.4183384147385186, "eval_wikibio_true_num_tokens": 100.7265625, "step": 225000 }, { "epoch": 13.160203544481488, "eval_msmarco_accuracy": 0.38278125, "eval_msmarco_bleu_score": 14.786269601221292, "eval_msmarco_bleu_score_sem": 1.301431073375026, "eval_msmarco_emb_cos_sim": 0.7630429267883301, "eval_msmarco_emb_cos_sim_sem": 0.018126230686903, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.9010883569717407, "eval_msmarco_n_ngrams_match_1": 26.328125, "eval_msmarco_n_ngrams_match_2": 11.4296875, "eval_msmarco_n_ngrams_match_3": 6.125, "eval_msmarco_num_pred_words": 58.0390625, "eval_msmarco_num_true_words": 61.203125, "eval_msmarco_perplexity": 6.693175043335637, "eval_msmarco_pred_num_tokens": 81.4453125, "eval_msmarco_rouge_score": 0.4162939148181744, "eval_msmarco_runtime": 26.7305, "eval_msmarco_samples_per_second": 18.705, "eval_msmarco_steps_per_second": 0.037, "eval_msmarco_token_set_f1": 0.4386105470626568, "eval_msmarco_token_set_f1_sem": 0.013797094050921421, "eval_msmarco_token_set_precision": 0.398971226617935, "eval_msmarco_token_set_recall": 0.505618602941184, "eval_msmarco_true_num_tokens": 77.5390625, "step": 225000 }, { "epoch": 13.16605252383459, "grad_norm": 0.18000362813472748, "learning_rate": 0.001, "loss": 1.976, "step": 225100 }, { "epoch": 13.171901503187694, "grad_norm": 0.21934127807617188, "learning_rate": 0.001, "loss": 1.9743, "step": 225200 }, { "epoch": 13.177750482540796, "grad_norm": 0.26311856508255005, "learning_rate": 0.001, "loss": 1.98, "step": 225300 }, { "epoch": 13.1835994618939, "grad_norm": 0.22225667536258698, "learning_rate": 0.001, "loss": 1.9812, "step": 225400 }, { "epoch": 13.189448441247002, "grad_norm": 0.22629490494728088, "learning_rate": 0.001, "loss": 1.9781, "step": 225500 }, { "epoch": 13.195297420600106, "grad_norm": 0.18140864372253418, "learning_rate": 0.001, "loss": 1.9726, "step": 225600 }, { "epoch": 13.201146399953208, "grad_norm": 0.22062429785728455, "learning_rate": 0.001, "loss": 1.9798, "step": 225700 }, { "epoch": 13.206995379306312, "grad_norm": 0.1806700974702835, "learning_rate": 0.001, "loss": 1.9829, "step": 225800 }, { "epoch": 13.212844358659414, "grad_norm": 0.18240559101104736, "learning_rate": 0.001, "loss": 1.9794, "step": 225900 }, { "epoch": 13.218693338012518, "grad_norm": 0.18337112665176392, "learning_rate": 0.001, "loss": 1.9756, "step": 226000 }, { "epoch": 13.22454231736562, "grad_norm": 0.2869911789894104, "learning_rate": 0.001, "loss": 1.9778, "step": 226100 }, { "epoch": 13.230391296718723, "grad_norm": 0.2892746329307556, "learning_rate": 0.001, "loss": 1.9825, "step": 226200 }, { "epoch": 13.236240276071825, "grad_norm": 0.2169916182756424, "learning_rate": 0.001, "loss": 1.9736, "step": 226300 }, { "epoch": 13.24208925542493, "grad_norm": 0.29951557517051697, "learning_rate": 0.001, "loss": 1.9817, "step": 226400 }, { "epoch": 13.247938234778031, "grad_norm": 0.1797240674495697, "learning_rate": 0.001, "loss": 1.9801, "step": 226500 }, { "epoch": 13.253787214131133, "grad_norm": 0.2689734697341919, "learning_rate": 0.001, "loss": 1.9755, "step": 226600 }, { "epoch": 13.259636193484237, "grad_norm": 0.21829570829868317, "learning_rate": 0.001, "loss": 1.977, "step": 226700 }, { "epoch": 13.265485172837339, "grad_norm": 0.2302432507276535, "learning_rate": 0.001, "loss": 1.9691, "step": 226800 }, { "epoch": 13.271334152190443, "grad_norm": 0.2564280927181244, "learning_rate": 0.001, "loss": 1.9783, "step": 226900 }, { "epoch": 13.277183131543545, "grad_norm": 0.2254856526851654, "learning_rate": 0.001, "loss": 1.9781, "step": 227000 }, { "epoch": 13.283032110896649, "grad_norm": 0.23087440431118011, "learning_rate": 0.001, "loss": 1.98, "step": 227100 }, { "epoch": 13.28888109024975, "grad_norm": 0.2724143862724304, "learning_rate": 0.001, "loss": 1.979, "step": 227200 }, { "epoch": 13.294730069602855, "grad_norm": 0.3142528533935547, "learning_rate": 0.001, "loss": 1.9796, "step": 227300 }, { "epoch": 13.300579048955957, "grad_norm": 0.18945831060409546, "learning_rate": 0.001, "loss": 1.9762, "step": 227400 }, { "epoch": 13.30642802830906, "grad_norm": 0.23417599499225616, "learning_rate": 0.001, "loss": 1.9752, "step": 227500 }, { "epoch": 13.312277007662162, "grad_norm": 0.20746006071567535, "learning_rate": 0.001, "loss": 1.9833, "step": 227600 }, { "epoch": 13.318125987015266, "grad_norm": 0.243167445063591, "learning_rate": 0.001, "loss": 1.976, "step": 227700 }, { "epoch": 13.323974966368368, "grad_norm": 0.2933729887008667, "learning_rate": 0.001, "loss": 1.9823, "step": 227800 }, { "epoch": 13.329823945721472, "grad_norm": 0.2580532133579254, "learning_rate": 0.001, "loss": 1.9789, "step": 227900 }, { "epoch": 13.335672925074574, "grad_norm": 0.2155558317899704, "learning_rate": 0.001, "loss": 1.981, "step": 228000 }, { "epoch": 13.341521904427678, "grad_norm": 0.3166956901550293, "learning_rate": 0.001, "loss": 1.9849, "step": 228100 }, { "epoch": 13.34737088378078, "grad_norm": 0.2182828187942505, "learning_rate": 0.001, "loss": 1.9763, "step": 228200 }, { "epoch": 13.353219863133884, "grad_norm": 0.2046653926372528, "learning_rate": 0.001, "loss": 1.975, "step": 228300 }, { "epoch": 13.359068842486986, "grad_norm": 0.20399290323257446, "learning_rate": 0.001, "loss": 1.9751, "step": 228400 }, { "epoch": 13.36491782184009, "grad_norm": 0.25457170605659485, "learning_rate": 0.001, "loss": 1.9814, "step": 228500 }, { "epoch": 13.370766801193191, "grad_norm": 0.2212488353252411, "learning_rate": 0.001, "loss": 1.9862, "step": 228600 }, { "epoch": 13.376615780546295, "grad_norm": 0.20105138421058655, "learning_rate": 0.001, "loss": 1.9772, "step": 228700 }, { "epoch": 13.382464759899397, "grad_norm": 0.25780004262924194, "learning_rate": 0.001, "loss": 1.982, "step": 228800 }, { "epoch": 13.388313739252501, "grad_norm": 0.1935599446296692, "learning_rate": 0.001, "loss": 1.9857, "step": 228900 }, { "epoch": 13.394162718605603, "grad_norm": 0.24386084079742432, "learning_rate": 0.001, "loss": 1.9725, "step": 229000 }, { "epoch": 13.400011697958707, "grad_norm": 0.17841483652591705, "learning_rate": 0.001, "loss": 1.9763, "step": 229100 }, { "epoch": 13.405860677311809, "grad_norm": 0.18572686612606049, "learning_rate": 0.001, "loss": 1.9739, "step": 229200 }, { "epoch": 13.411709656664913, "grad_norm": 0.23428656160831451, "learning_rate": 0.001, "loss": 1.976, "step": 229300 }, { "epoch": 13.417558636018015, "grad_norm": 0.27107733488082886, "learning_rate": 0.001, "loss": 1.985, "step": 229400 }, { "epoch": 13.423407615371119, "grad_norm": 0.24615222215652466, "learning_rate": 0.001, "loss": 1.9826, "step": 229500 }, { "epoch": 13.42925659472422, "grad_norm": 0.24661624431610107, "learning_rate": 0.001, "loss": 1.9811, "step": 229600 }, { "epoch": 13.435105574077323, "grad_norm": 0.17834718525409698, "learning_rate": 0.001, "loss": 1.9737, "step": 229700 }, { "epoch": 13.440954553430426, "grad_norm": 0.19614946842193604, "learning_rate": 0.001, "loss": 1.9792, "step": 229800 }, { "epoch": 13.446803532783528, "grad_norm": 0.23444317281246185, "learning_rate": 0.001, "loss": 1.9813, "step": 229900 }, { "epoch": 13.452652512136632, "grad_norm": 0.2423752099275589, "learning_rate": 0.001, "loss": 1.9776, "step": 230000 }, { "epoch": 13.458501491489734, "grad_norm": 0.3296078145503998, "learning_rate": 0.001, "loss": 1.9774, "step": 230100 }, { "epoch": 13.464350470842838, "grad_norm": 0.20230945944786072, "learning_rate": 0.001, "loss": 1.9788, "step": 230200 }, { "epoch": 13.47019945019594, "grad_norm": 0.25731492042541504, "learning_rate": 0.001, "loss": 1.9755, "step": 230300 }, { "epoch": 13.476048429549044, "grad_norm": 0.23995602130889893, "learning_rate": 0.001, "loss": 1.9792, "step": 230400 }, { "epoch": 13.481897408902146, "grad_norm": 0.2885236144065857, "learning_rate": 0.001, "loss": 1.9772, "step": 230500 }, { "epoch": 13.48774638825525, "grad_norm": 0.26185616850852966, "learning_rate": 0.001, "loss": 1.9794, "step": 230600 }, { "epoch": 13.493595367608352, "grad_norm": 0.24679481983184814, "learning_rate": 0.001, "loss": 1.978, "step": 230700 }, { "epoch": 13.499444346961456, "grad_norm": 0.17711175978183746, "learning_rate": 0.001, "loss": 1.9758, "step": 230800 }, { "epoch": 13.505293326314558, "grad_norm": 0.19859519600868225, "learning_rate": 0.001, "loss": 1.9716, "step": 230900 }, { "epoch": 13.511142305667661, "grad_norm": 0.27562448382377625, "learning_rate": 0.001, "loss": 1.9811, "step": 231000 }, { "epoch": 13.516991285020763, "grad_norm": 0.2434300035238266, "learning_rate": 0.001, "loss": 1.9808, "step": 231100 }, { "epoch": 13.522840264373867, "grad_norm": 0.25430893898010254, "learning_rate": 0.001, "loss": 1.9833, "step": 231200 }, { "epoch": 13.52868924372697, "grad_norm": 0.42284777760505676, "learning_rate": 0.001, "loss": 1.9865, "step": 231300 }, { "epoch": 13.534538223080073, "grad_norm": 0.18188630044460297, "learning_rate": 0.001, "loss": 1.9834, "step": 231400 }, { "epoch": 13.540387202433175, "grad_norm": 0.2692151665687561, "learning_rate": 0.001, "loss": 1.9783, "step": 231500 }, { "epoch": 13.546236181786279, "grad_norm": 0.21099160611629486, "learning_rate": 0.001, "loss": 1.976, "step": 231600 }, { "epoch": 13.55208516113938, "grad_norm": 0.23653753101825714, "learning_rate": 0.001, "loss": 1.975, "step": 231700 }, { "epoch": 13.557934140492485, "grad_norm": 0.21306544542312622, "learning_rate": 0.001, "loss": 1.9743, "step": 231800 }, { "epoch": 13.563783119845587, "grad_norm": 0.1871899962425232, "learning_rate": 0.001, "loss": 1.9852, "step": 231900 }, { "epoch": 13.56963209919869, "grad_norm": 0.21989047527313232, "learning_rate": 0.001, "loss": 1.9785, "step": 232000 }, { "epoch": 13.575481078551793, "grad_norm": 0.21074099838733673, "learning_rate": 0.001, "loss": 1.977, "step": 232100 }, { "epoch": 13.581330057904896, "grad_norm": 0.2069406509399414, "learning_rate": 0.001, "loss": 1.9793, "step": 232200 }, { "epoch": 13.587179037257998, "grad_norm": 0.2484118640422821, "learning_rate": 0.001, "loss": 1.9831, "step": 232300 }, { "epoch": 13.593028016611102, "grad_norm": 0.19142894446849823, "learning_rate": 0.001, "loss": 1.9729, "step": 232400 }, { "epoch": 13.598876995964204, "grad_norm": 0.24409140646457672, "learning_rate": 0.001, "loss": 1.9772, "step": 232500 }, { "epoch": 13.604725975317308, "grad_norm": 0.2699277102947235, "learning_rate": 0.001, "loss": 1.9885, "step": 232600 }, { "epoch": 13.61057495467041, "grad_norm": 0.3107939064502716, "learning_rate": 0.001, "loss": 1.9861, "step": 232700 }, { "epoch": 13.616423934023512, "grad_norm": 0.22788769006729126, "learning_rate": 0.001, "loss": 1.9772, "step": 232800 }, { "epoch": 13.622272913376616, "grad_norm": 0.2329796701669693, "learning_rate": 0.001, "loss": 1.9802, "step": 232900 }, { "epoch": 13.628121892729718, "grad_norm": 0.21963298320770264, "learning_rate": 0.001, "loss": 1.9798, "step": 233000 }, { "epoch": 13.633970872082822, "grad_norm": 0.2187013030052185, "learning_rate": 0.001, "loss": 1.9805, "step": 233100 }, { "epoch": 13.639819851435924, "grad_norm": 0.31553494930267334, "learning_rate": 0.001, "loss": 1.9766, "step": 233200 }, { "epoch": 13.645668830789027, "grad_norm": 0.2591012716293335, "learning_rate": 0.001, "loss": 1.9867, "step": 233300 }, { "epoch": 13.65151781014213, "grad_norm": 0.22272373735904694, "learning_rate": 0.001, "loss": 1.9793, "step": 233400 }, { "epoch": 13.657366789495233, "grad_norm": 0.2195117026567459, "learning_rate": 0.001, "loss": 1.9817, "step": 233500 }, { "epoch": 13.663215768848335, "grad_norm": 0.26841244101524353, "learning_rate": 0.001, "loss": 1.978, "step": 233600 }, { "epoch": 13.66906474820144, "grad_norm": 0.18519963324069977, "learning_rate": 0.001, "loss": 1.9785, "step": 233700 }, { "epoch": 13.674913727554541, "grad_norm": 0.23841778934001923, "learning_rate": 0.001, "loss": 1.9828, "step": 233800 }, { "epoch": 13.680762706907645, "grad_norm": 0.22297212481498718, "learning_rate": 0.001, "loss": 1.9782, "step": 233900 }, { "epoch": 13.686611686260747, "grad_norm": 0.3149520754814148, "learning_rate": 0.001, "loss": 1.983, "step": 234000 }, { "epoch": 13.69246066561385, "grad_norm": 0.20052683353424072, "learning_rate": 0.001, "loss": 1.9708, "step": 234100 }, { "epoch": 13.698309644966953, "grad_norm": 0.2523972690105438, "learning_rate": 0.001, "loss": 1.9843, "step": 234200 }, { "epoch": 13.704158624320057, "grad_norm": 0.2204774171113968, "learning_rate": 0.001, "loss": 1.9832, "step": 234300 }, { "epoch": 13.710007603673159, "grad_norm": 0.21496403217315674, "learning_rate": 0.001, "loss": 1.9793, "step": 234400 }, { "epoch": 13.715856583026262, "grad_norm": 0.20935972034931183, "learning_rate": 0.001, "loss": 1.9842, "step": 234500 }, { "epoch": 13.721705562379364, "grad_norm": 0.1611945629119873, "learning_rate": 0.001, "loss": 1.9786, "step": 234600 }, { "epoch": 13.727554541732468, "grad_norm": 0.1832074373960495, "learning_rate": 0.001, "loss": 1.9777, "step": 234700 }, { "epoch": 13.73340352108557, "grad_norm": 0.375221312046051, "learning_rate": 0.001, "loss": 1.9825, "step": 234800 }, { "epoch": 13.739252500438674, "grad_norm": 0.33708828687667847, "learning_rate": 0.001, "loss": 1.9756, "step": 234900 }, { "epoch": 13.745101479791776, "grad_norm": 0.2591401934623718, "learning_rate": 0.001, "loss": 1.974, "step": 235000 }, { "epoch": 13.75095045914488, "grad_norm": 0.19159042835235596, "learning_rate": 0.001, "loss": 1.9764, "step": 235100 }, { "epoch": 13.756799438497982, "grad_norm": 0.25648340582847595, "learning_rate": 0.001, "loss": 1.9789, "step": 235200 }, { "epoch": 13.762648417851086, "grad_norm": 0.22792404890060425, "learning_rate": 0.001, "loss": 1.9905, "step": 235300 }, { "epoch": 13.768497397204188, "grad_norm": 0.2643321752548218, "learning_rate": 0.001, "loss": 1.9751, "step": 235400 }, { "epoch": 13.774346376557292, "grad_norm": 0.1535710096359253, "learning_rate": 0.001, "loss": 1.9741, "step": 235500 }, { "epoch": 13.780195355910394, "grad_norm": 0.17890501022338867, "learning_rate": 0.001, "loss": 1.9797, "step": 235600 }, { "epoch": 13.786044335263497, "grad_norm": 0.21138638257980347, "learning_rate": 0.001, "loss": 1.976, "step": 235700 }, { "epoch": 13.7918933146166, "grad_norm": 0.28156718611717224, "learning_rate": 0.001, "loss": 1.9875, "step": 235800 }, { "epoch": 13.797742293969701, "grad_norm": 0.1960635483264923, "learning_rate": 0.001, "loss": 1.9784, "step": 235900 }, { "epoch": 13.803591273322805, "grad_norm": 0.25211068987846375, "learning_rate": 0.001, "loss": 1.9796, "step": 236000 }, { "epoch": 13.809440252675907, "grad_norm": 0.20620541274547577, "learning_rate": 0.001, "loss": 1.9786, "step": 236100 }, { "epoch": 13.815289232029011, "grad_norm": 0.19074097275733948, "learning_rate": 0.001, "loss": 1.9818, "step": 236200 }, { "epoch": 13.821138211382113, "grad_norm": 0.16489678621292114, "learning_rate": 0.001, "loss": 1.9816, "step": 236300 }, { "epoch": 13.826987190735217, "grad_norm": 0.18464569747447968, "learning_rate": 0.001, "loss": 1.9808, "step": 236400 }, { "epoch": 13.832836170088319, "grad_norm": 0.20405010879039764, "learning_rate": 0.001, "loss": 1.9778, "step": 236500 }, { "epoch": 13.838685149441423, "grad_norm": 0.21497565507888794, "learning_rate": 0.001, "loss": 1.981, "step": 236600 }, { "epoch": 13.844534128794525, "grad_norm": 0.2657020688056946, "learning_rate": 0.001, "loss": 1.9761, "step": 236700 }, { "epoch": 13.850383108147629, "grad_norm": 0.272628515958786, "learning_rate": 0.001, "loss": 1.9846, "step": 236800 }, { "epoch": 13.85623208750073, "grad_norm": 0.27056393027305603, "learning_rate": 0.001, "loss": 1.9809, "step": 236900 }, { "epoch": 13.862081066853834, "grad_norm": 0.2091963291168213, "learning_rate": 0.001, "loss": 1.977, "step": 237000 }, { "epoch": 13.867930046206936, "grad_norm": 0.20770621299743652, "learning_rate": 0.001, "loss": 1.9825, "step": 237100 }, { "epoch": 13.87377902556004, "grad_norm": 0.20477941632270813, "learning_rate": 0.001, "loss": 1.979, "step": 237200 }, { "epoch": 13.879628004913142, "grad_norm": 0.1808488667011261, "learning_rate": 0.001, "loss": 1.9774, "step": 237300 }, { "epoch": 13.885476984266246, "grad_norm": 0.24005036056041718, "learning_rate": 0.001, "loss": 1.9744, "step": 237400 }, { "epoch": 13.891325963619348, "grad_norm": 0.2497682422399521, "learning_rate": 0.001, "loss": 1.9798, "step": 237500 }, { "epoch": 13.897174942972452, "grad_norm": 0.25223663449287415, "learning_rate": 0.001, "loss": 1.9773, "step": 237600 }, { "epoch": 13.903023922325554, "grad_norm": 0.20614300668239594, "learning_rate": 0.001, "loss": 1.9854, "step": 237700 }, { "epoch": 13.908872901678658, "grad_norm": 0.29770612716674805, "learning_rate": 0.001, "loss": 1.9818, "step": 237800 }, { "epoch": 13.91472188103176, "grad_norm": 0.1801588237285614, "learning_rate": 0.001, "loss": 1.9881, "step": 237900 }, { "epoch": 13.920570860384863, "grad_norm": 0.2041631042957306, "learning_rate": 0.001, "loss": 1.9783, "step": 238000 }, { "epoch": 13.926419839737965, "grad_norm": 0.2253776490688324, "learning_rate": 0.001, "loss": 1.9738, "step": 238100 }, { "epoch": 13.93226881909107, "grad_norm": 0.1858462542295456, "learning_rate": 0.001, "loss": 1.9768, "step": 238200 }, { "epoch": 13.938117798444171, "grad_norm": 0.16976016759872437, "learning_rate": 0.001, "loss": 1.9743, "step": 238300 }, { "epoch": 13.943966777797275, "grad_norm": 0.33217376470565796, "learning_rate": 0.001, "loss": 1.9787, "step": 238400 }, { "epoch": 13.949815757150377, "grad_norm": 0.21794497966766357, "learning_rate": 0.001, "loss": 1.9749, "step": 238500 }, { "epoch": 13.955664736503481, "grad_norm": 0.241551011800766, "learning_rate": 0.001, "loss": 1.9729, "step": 238600 }, { "epoch": 13.961513715856583, "grad_norm": 0.27513033151626587, "learning_rate": 0.001, "loss": 1.9823, "step": 238700 }, { "epoch": 13.967362695209687, "grad_norm": 0.27918586134910583, "learning_rate": 0.001, "loss": 1.9813, "step": 238800 }, { "epoch": 13.973211674562789, "grad_norm": 0.23931871354579926, "learning_rate": 0.001, "loss": 1.9801, "step": 238900 }, { "epoch": 13.97906065391589, "grad_norm": 0.3112543225288391, "learning_rate": 0.001, "loss": 1.9858, "step": 239000 }, { "epoch": 13.984909633268995, "grad_norm": 0.2306748628616333, "learning_rate": 0.001, "loss": 1.9872, "step": 239100 }, { "epoch": 13.990758612622097, "grad_norm": 0.18557582795619965, "learning_rate": 0.001, "loss": 1.9847, "step": 239200 }, { "epoch": 13.9966075919752, "grad_norm": 0.18405134975910187, "learning_rate": 0.001, "loss": 1.9762, "step": 239300 }, { "epoch": 14.002456571328302, "grad_norm": 0.18319526314735413, "learning_rate": 0.001, "loss": 1.9775, "step": 239400 }, { "epoch": 14.008305550681406, "grad_norm": 0.2694479525089264, "learning_rate": 0.001, "loss": 1.9607, "step": 239500 }, { "epoch": 14.014154530034508, "grad_norm": 0.20103605091571808, "learning_rate": 0.001, "loss": 1.9617, "step": 239600 }, { "epoch": 14.020003509387612, "grad_norm": 0.20391428470611572, "learning_rate": 0.001, "loss": 1.9643, "step": 239700 }, { "epoch": 14.025852488740714, "grad_norm": 0.22883525490760803, "learning_rate": 0.001, "loss": 1.9652, "step": 239800 }, { "epoch": 14.031701468093818, "grad_norm": 0.22937746345996857, "learning_rate": 0.001, "loss": 1.9688, "step": 239900 }, { "epoch": 14.03755044744692, "grad_norm": 0.2870284914970398, "learning_rate": 0.001, "loss": 1.9632, "step": 240000 }, { "epoch": 14.043399426800024, "grad_norm": 0.2622302770614624, "learning_rate": 0.001, "loss": 1.962, "step": 240100 }, { "epoch": 14.049248406153126, "grad_norm": 0.28526049852371216, "learning_rate": 0.001, "loss": 1.9615, "step": 240200 }, { "epoch": 14.05509738550623, "grad_norm": 0.28516653180122375, "learning_rate": 0.001, "loss": 1.9649, "step": 240300 }, { "epoch": 14.060946364859332, "grad_norm": 0.25162068009376526, "learning_rate": 0.001, "loss": 1.9636, "step": 240400 }, { "epoch": 14.066795344212435, "grad_norm": 0.24342043697834015, "learning_rate": 0.001, "loss": 1.9671, "step": 240500 }, { "epoch": 14.072644323565537, "grad_norm": 0.18953265249729156, "learning_rate": 0.001, "loss": 1.9673, "step": 240600 }, { "epoch": 14.078493302918641, "grad_norm": 0.22126680612564087, "learning_rate": 0.001, "loss": 1.9643, "step": 240700 }, { "epoch": 14.084342282271743, "grad_norm": 0.23117570579051971, "learning_rate": 0.001, "loss": 1.9685, "step": 240800 }, { "epoch": 14.090191261624847, "grad_norm": 0.2592628002166748, "learning_rate": 0.001, "loss": 1.9645, "step": 240900 }, { "epoch": 14.096040240977949, "grad_norm": 0.1811443269252777, "learning_rate": 0.001, "loss": 1.9561, "step": 241000 }, { "epoch": 14.101889220331053, "grad_norm": 0.20813414454460144, "learning_rate": 0.001, "loss": 1.9613, "step": 241100 }, { "epoch": 14.107738199684155, "grad_norm": 0.28780272603034973, "learning_rate": 0.001, "loss": 1.967, "step": 241200 }, { "epoch": 14.113587179037259, "grad_norm": 0.24872490763664246, "learning_rate": 0.001, "loss": 1.9713, "step": 241300 }, { "epoch": 14.11943615839036, "grad_norm": 0.26449623703956604, "learning_rate": 0.001, "loss": 1.962, "step": 241400 }, { "epoch": 14.125285137743465, "grad_norm": 0.298951655626297, "learning_rate": 0.001, "loss": 1.9621, "step": 241500 }, { "epoch": 14.131134117096567, "grad_norm": 0.26754069328308105, "learning_rate": 0.001, "loss": 1.9679, "step": 241600 }, { "epoch": 14.13698309644967, "grad_norm": 0.29970309138298035, "learning_rate": 0.001, "loss": 1.9646, "step": 241700 }, { "epoch": 14.142832075802772, "grad_norm": 0.2209417074918747, "learning_rate": 0.001, "loss": 1.9694, "step": 241800 }, { "epoch": 14.148681055155876, "grad_norm": 0.22292257845401764, "learning_rate": 0.001, "loss": 1.9665, "step": 241900 }, { "epoch": 14.154530034508978, "grad_norm": 0.34647122025489807, "learning_rate": 0.001, "loss": 1.9638, "step": 242000 }, { "epoch": 14.16037901386208, "grad_norm": 0.19242088496685028, "learning_rate": 0.001, "loss": 1.9626, "step": 242100 }, { "epoch": 14.166227993215184, "grad_norm": 0.21566060185432434, "learning_rate": 0.001, "loss": 1.9699, "step": 242200 }, { "epoch": 14.172076972568286, "grad_norm": 0.1802053600549698, "learning_rate": 0.001, "loss": 1.9596, "step": 242300 }, { "epoch": 14.17792595192139, "grad_norm": 0.2340971827507019, "learning_rate": 0.001, "loss": 1.9655, "step": 242400 }, { "epoch": 14.183774931274492, "grad_norm": 0.17252445220947266, "learning_rate": 0.001, "loss": 1.9647, "step": 242500 }, { "epoch": 14.189623910627596, "grad_norm": 0.25843000411987305, "learning_rate": 0.001, "loss": 1.9676, "step": 242600 }, { "epoch": 14.195472889980698, "grad_norm": 0.22856535017490387, "learning_rate": 0.001, "loss": 1.9698, "step": 242700 }, { "epoch": 14.201321869333801, "grad_norm": 0.24771948158740997, "learning_rate": 0.001, "loss": 1.9665, "step": 242800 }, { "epoch": 14.207170848686903, "grad_norm": 0.27833253145217896, "learning_rate": 0.001, "loss": 1.9675, "step": 242900 }, { "epoch": 14.213019828040007, "grad_norm": 0.2763110101222992, "learning_rate": 0.001, "loss": 1.9711, "step": 243000 }, { "epoch": 14.21886880739311, "grad_norm": 0.2269279956817627, "learning_rate": 0.001, "loss": 1.9699, "step": 243100 }, { "epoch": 14.224717786746213, "grad_norm": 0.21824385225772858, "learning_rate": 0.001, "loss": 1.9712, "step": 243200 }, { "epoch": 14.230566766099315, "grad_norm": 0.23626267910003662, "learning_rate": 0.001, "loss": 1.9606, "step": 243300 }, { "epoch": 14.236415745452419, "grad_norm": 0.24778316915035248, "learning_rate": 0.001, "loss": 1.968, "step": 243400 }, { "epoch": 14.242264724805521, "grad_norm": 0.3011469542980194, "learning_rate": 0.001, "loss": 1.9718, "step": 243500 }, { "epoch": 14.248113704158625, "grad_norm": 0.20394834876060486, "learning_rate": 0.001, "loss": 1.9725, "step": 243600 }, { "epoch": 14.253962683511727, "grad_norm": 0.21683980524539948, "learning_rate": 0.001, "loss": 1.9656, "step": 243700 }, { "epoch": 14.25981166286483, "grad_norm": 0.253658264875412, "learning_rate": 0.001, "loss": 1.9673, "step": 243800 }, { "epoch": 14.265660642217933, "grad_norm": 0.27220797538757324, "learning_rate": 0.001, "loss": 1.9656, "step": 243900 }, { "epoch": 14.271509621571036, "grad_norm": 0.23035696148872375, "learning_rate": 0.001, "loss": 1.9684, "step": 244000 }, { "epoch": 14.277358600924138, "grad_norm": 0.19970186054706573, "learning_rate": 0.001, "loss": 1.969, "step": 244100 }, { "epoch": 14.283207580277242, "grad_norm": 0.20713543891906738, "learning_rate": 0.001, "loss": 1.9662, "step": 244200 }, { "epoch": 14.289056559630344, "grad_norm": 0.20946021378040314, "learning_rate": 0.001, "loss": 1.97, "step": 244300 }, { "epoch": 14.294905538983448, "grad_norm": 0.2905365824699402, "learning_rate": 0.001, "loss": 1.9707, "step": 244400 }, { "epoch": 14.30075451833655, "grad_norm": 0.2528438866138458, "learning_rate": 0.001, "loss": 1.9668, "step": 244500 }, { "epoch": 14.306603497689654, "grad_norm": 0.2482183575630188, "learning_rate": 0.001, "loss": 1.964, "step": 244600 }, { "epoch": 14.312452477042756, "grad_norm": 0.2525128126144409, "learning_rate": 0.001, "loss": 1.9756, "step": 244700 }, { "epoch": 14.31830145639586, "grad_norm": 0.23711328208446503, "learning_rate": 0.001, "loss": 1.9684, "step": 244800 }, { "epoch": 14.324150435748962, "grad_norm": 0.2340916097164154, "learning_rate": 0.001, "loss": 1.9712, "step": 244900 }, { "epoch": 14.329999415102066, "grad_norm": 0.25383511185646057, "learning_rate": 0.001, "loss": 1.9689, "step": 245000 }, { "epoch": 14.335848394455168, "grad_norm": 0.24396270513534546, "learning_rate": 0.001, "loss": 1.9662, "step": 245100 }, { "epoch": 14.34169737380827, "grad_norm": 0.26016977429389954, "learning_rate": 0.001, "loss": 1.9748, "step": 245200 }, { "epoch": 14.347546353161373, "grad_norm": 0.2611865699291229, "learning_rate": 0.001, "loss": 1.9675, "step": 245300 }, { "epoch": 14.353395332514475, "grad_norm": 0.2523117959499359, "learning_rate": 0.001, "loss": 1.9678, "step": 245400 }, { "epoch": 14.35924431186758, "grad_norm": 0.23650695383548737, "learning_rate": 0.001, "loss": 1.9655, "step": 245500 }, { "epoch": 14.365093291220681, "grad_norm": 0.2713968753814697, "learning_rate": 0.001, "loss": 1.9684, "step": 245600 }, { "epoch": 14.370942270573785, "grad_norm": 0.17855042219161987, "learning_rate": 0.001, "loss": 1.9685, "step": 245700 }, { "epoch": 14.376791249926887, "grad_norm": 0.22937370836734772, "learning_rate": 0.001, "loss": 1.9679, "step": 245800 }, { "epoch": 14.38264022927999, "grad_norm": 0.2672974765300751, "learning_rate": 0.001, "loss": 1.9684, "step": 245900 }, { "epoch": 14.388489208633093, "grad_norm": 0.23349754512310028, "learning_rate": 0.001, "loss": 1.9769, "step": 246000 }, { "epoch": 14.394338187986197, "grad_norm": 0.19294412434101105, "learning_rate": 0.001, "loss": 1.9659, "step": 246100 }, { "epoch": 14.400187167339299, "grad_norm": 0.24998711049556732, "learning_rate": 0.001, "loss": 1.965, "step": 246200 }, { "epoch": 14.406036146692403, "grad_norm": 0.2150680273771286, "learning_rate": 0.001, "loss": 1.966, "step": 246300 }, { "epoch": 14.411885126045505, "grad_norm": 0.2255224585533142, "learning_rate": 0.001, "loss": 1.9624, "step": 246400 }, { "epoch": 14.417734105398608, "grad_norm": 0.30600741505622864, "learning_rate": 0.001, "loss": 1.9652, "step": 246500 }, { "epoch": 14.42358308475171, "grad_norm": 0.18810750544071198, "learning_rate": 0.001, "loss": 1.9702, "step": 246600 }, { "epoch": 14.429432064104814, "grad_norm": 0.16686172783374786, "learning_rate": 0.001, "loss": 1.9661, "step": 246700 }, { "epoch": 14.435281043457916, "grad_norm": 0.24224071204662323, "learning_rate": 0.001, "loss": 1.9726, "step": 246800 }, { "epoch": 14.44113002281102, "grad_norm": 0.22119352221488953, "learning_rate": 0.001, "loss": 1.9733, "step": 246900 }, { "epoch": 14.446979002164122, "grad_norm": 0.20119157433509827, "learning_rate": 0.001, "loss": 1.9703, "step": 247000 }, { "epoch": 14.452827981517226, "grad_norm": 0.20621377229690552, "learning_rate": 0.001, "loss": 1.9616, "step": 247100 }, { "epoch": 14.458676960870328, "grad_norm": 0.19879889488220215, "learning_rate": 0.001, "loss": 1.9709, "step": 247200 }, { "epoch": 14.464525940223432, "grad_norm": 0.20266245305538177, "learning_rate": 0.001, "loss": 1.9677, "step": 247300 }, { "epoch": 14.470374919576534, "grad_norm": 0.24737124145030975, "learning_rate": 0.001, "loss": 1.9657, "step": 247400 }, { "epoch": 14.476223898929637, "grad_norm": 0.24154464900493622, "learning_rate": 0.001, "loss": 1.968, "step": 247500 }, { "epoch": 14.48207287828274, "grad_norm": 0.29823237657546997, "learning_rate": 0.001, "loss": 1.9698, "step": 247600 }, { "epoch": 14.487921857635843, "grad_norm": 0.2436269223690033, "learning_rate": 0.001, "loss": 1.9763, "step": 247700 }, { "epoch": 14.493770836988945, "grad_norm": 0.17624056339263916, "learning_rate": 0.001, "loss": 1.97, "step": 247800 }, { "epoch": 14.499619816342049, "grad_norm": 0.2583642303943634, "learning_rate": 0.001, "loss": 1.9655, "step": 247900 }, { "epoch": 14.505468795695151, "grad_norm": 0.23084022104740143, "learning_rate": 0.001, "loss": 1.9636, "step": 248000 }, { "epoch": 14.511317775048255, "grad_norm": 0.25879910588264465, "learning_rate": 0.001, "loss": 1.9688, "step": 248100 }, { "epoch": 14.517166754401357, "grad_norm": 0.2502914071083069, "learning_rate": 0.001, "loss": 1.9691, "step": 248200 }, { "epoch": 14.523015733754459, "grad_norm": 0.22740785777568817, "learning_rate": 0.001, "loss": 1.9731, "step": 248300 }, { "epoch": 14.528864713107563, "grad_norm": 0.2173127979040146, "learning_rate": 0.001, "loss": 1.966, "step": 248400 }, { "epoch": 14.534713692460665, "grad_norm": 0.19417347013950348, "learning_rate": 0.001, "loss": 1.9672, "step": 248500 }, { "epoch": 14.540562671813769, "grad_norm": 0.2560805678367615, "learning_rate": 0.001, "loss": 1.9709, "step": 248600 }, { "epoch": 14.54641165116687, "grad_norm": 0.25292032957077026, "learning_rate": 0.001, "loss": 1.9694, "step": 248700 }, { "epoch": 14.552260630519974, "grad_norm": 0.18949897587299347, "learning_rate": 0.001, "loss": 1.9651, "step": 248800 }, { "epoch": 14.558109609873076, "grad_norm": 0.20750831067562103, "learning_rate": 0.001, "loss": 1.9647, "step": 248900 }, { "epoch": 14.56395858922618, "grad_norm": 0.21664410829544067, "learning_rate": 0.001, "loss": 1.9654, "step": 249000 }, { "epoch": 14.569807568579282, "grad_norm": 0.297671377658844, "learning_rate": 0.001, "loss": 1.9668, "step": 249100 }, { "epoch": 14.575656547932386, "grad_norm": 0.1997825652360916, "learning_rate": 0.001, "loss": 1.9756, "step": 249200 }, { "epoch": 14.581505527285488, "grad_norm": 0.26294997334480286, "learning_rate": 0.001, "loss": 1.9716, "step": 249300 }, { "epoch": 14.587354506638592, "grad_norm": 0.19298474490642548, "learning_rate": 0.001, "loss": 1.9668, "step": 249400 }, { "epoch": 14.593203485991694, "grad_norm": 0.282370924949646, "learning_rate": 0.001, "loss": 1.9625, "step": 249500 }, { "epoch": 14.599052465344798, "grad_norm": 0.2085217833518982, "learning_rate": 0.001, "loss": 1.9727, "step": 249600 }, { "epoch": 14.6049014446979, "grad_norm": 0.30055704712867737, "learning_rate": 0.001, "loss": 1.9666, "step": 249700 }, { "epoch": 14.610750424051004, "grad_norm": 0.2088634967803955, "learning_rate": 0.001, "loss": 1.9653, "step": 249800 }, { "epoch": 14.616599403404106, "grad_norm": 0.2309078425168991, "learning_rate": 0.001, "loss": 1.9697, "step": 249900 }, { "epoch": 14.62244838275721, "grad_norm": 0.1842934638261795, "learning_rate": 0.001, "loss": 1.9638, "step": 250000 }, { "epoch": 14.62244838275721, "eval_ag_news_accuracy": 0.234890625, "eval_ag_news_bleu_score": 6.498486765883003, "eval_ag_news_bleu_score_sem": 0.42467936413504104, "eval_ag_news_emb_cos_sim": 0.6772029995918274, "eval_ag_news_emb_cos_sim_sem": 0.015383739955723286, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.777961015701294, "eval_ag_news_n_ngrams_match_1": 12.96875, "eval_ag_news_n_ngrams_match_2": 3.6328125, "eval_ag_news_n_ngrams_match_3": 1.4453125, "eval_ag_news_num_pred_words": 46.0859375, "eval_ag_news_num_true_words": 44.1328125, "eval_ag_news_perplexity": 16.086188001710145, "eval_ag_news_pred_num_tokens": 70.3359375, "eval_ag_news_rouge_score": 0.27785323899663716, "eval_ag_news_runtime": 38.7338, "eval_ag_news_samples_per_second": 12.909, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.31399472958385877, "eval_ag_news_token_set_f1_sem": 0.0108480011676767, "eval_ag_news_token_set_precision": 0.29715205378378845, "eval_ag_news_token_set_recall": 0.34404635247349646, "eval_ag_news_true_num_tokens": 62.0234375, "step": 250000 }, { "epoch": 14.62244838275721, "eval_anthropic_toxic_prompts_accuracy": 0.101671875, "eval_anthropic_toxic_prompts_bleu_score": 37.876629564448045, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.4925344022398987, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.886345386505127, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008726435713469982, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1015625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02680456515850638, "eval_anthropic_toxic_prompts_loss": 1.3001242876052856, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.3984375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 4.9375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.1015625, "eval_anthropic_toxic_prompts_num_pred_words": 13.8359375, "eval_anthropic_toxic_prompts_num_true_words": 13.6484375, "eval_anthropic_toxic_prompts_perplexity": 3.6697527440568822, "eval_anthropic_toxic_prompts_pred_num_tokens": 17.546875, "eval_anthropic_toxic_prompts_rouge_score": 0.6555443048326626, "eval_anthropic_toxic_prompts_runtime": 29.7265, "eval_anthropic_toxic_prompts_samples_per_second": 16.82, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6745052353473135, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017790446837685825, "eval_anthropic_toxic_prompts_token_set_precision": 0.6788495798695494, "eval_anthropic_toxic_prompts_token_set_recall": 0.6763996588915061, "eval_anthropic_toxic_prompts_true_num_tokens": 16.765625, "step": 250000 }, { "epoch": 14.62244838275721, "eval_arxiv_accuracy": 0.366796875, "eval_arxiv_bleu_score": 1.6228597473489064, "eval_arxiv_bleu_score_sem": 0.15106702616175596, "eval_arxiv_emb_cos_sim": 0.4353369474411011, "eval_arxiv_emb_cos_sim_sem": 0.019160669296979904, "eval_arxiv_emb_top1_equal": 0.890625, "eval_arxiv_emb_top1_equal_sem": 0.02769520878791809, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4896962642669678, "eval_arxiv_n_ngrams_match_1": 12.4375, "eval_arxiv_n_ngrams_match_2": 2.21875, "eval_arxiv_n_ngrams_match_3": 0.4765625, "eval_arxiv_num_pred_words": 51.90625, "eval_arxiv_num_true_words": 84.96875, "eval_arxiv_perplexity": 32.77599095455936, "eval_arxiv_pred_num_tokens": 125.8515625, "eval_arxiv_rouge_score": 0.16483262329544063, "eval_arxiv_runtime": 30.5629, "eval_arxiv_samples_per_second": 16.36, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.16708717169616463, "eval_arxiv_token_set_f1_sem": 0.00888278440898558, "eval_arxiv_token_set_precision": 0.1131859498534452, "eval_arxiv_token_set_recall": 0.4060385986844934, "eval_arxiv_true_num_tokens": 123.1796875, "step": 250000 }, { "epoch": 14.62244838275721, "eval_python_code_alpaca_accuracy": 0.126109375, "eval_python_code_alpaca_bleu_score": 23.768178067018333, "eval_python_code_alpaca_bleu_score_sem": 1.5690316809354332, "eval_python_code_alpaca_emb_cos_sim": 0.8465983867645264, "eval_python_code_alpaca_emb_cos_sim_sem": 0.010079991072416306, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5684688091278076, "eval_python_code_alpaca_n_ngrams_match_1": 10.265625, "eval_python_code_alpaca_n_ngrams_match_2": 5.2421875, "eval_python_code_alpaca_n_ngrams_match_3": 2.78125, "eval_python_code_alpaca_num_pred_words": 18.484375, "eval_python_code_alpaca_num_true_words": 19.75, "eval_python_code_alpaca_perplexity": 4.79929392976286, "eval_python_code_alpaca_pred_num_tokens": 24.828125, "eval_python_code_alpaca_rouge_score": 0.5594349685148288, "eval_python_code_alpaca_runtime": 29.724, "eval_python_code_alpaca_samples_per_second": 16.821, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.5812394578063776, "eval_python_code_alpaca_token_set_f1_sem": 0.013350974759833437, "eval_python_code_alpaca_token_set_precision": 0.5627463411430778, "eval_python_code_alpaca_token_set_recall": 0.6091594611809797, "eval_python_code_alpaca_true_num_tokens": 26.3125, "step": 250000 }, { "epoch": 14.62244838275721, "eval_wikibio_accuracy": 0.355265625, "eval_wikibio_bleu_score": 6.077290923451214, "eval_wikibio_bleu_score_sem": 0.6206453776669864, "eval_wikibio_emb_cos_sim": 0.5661208033561707, "eval_wikibio_emb_cos_sim_sem": 0.02499762736260891, "eval_wikibio_emb_top1_equal": 0.8515625, "eval_wikibio_emb_top1_equal_sem": 0.03154846653342247, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7564871311187744, "eval_wikibio_n_ngrams_match_1": 12.5390625, "eval_wikibio_n_ngrams_match_2": 4.1015625, "eval_wikibio_n_ngrams_match_3": 1.6953125, "eval_wikibio_num_pred_words": 49.921875, "eval_wikibio_num_true_words": 51.546875, "eval_wikibio_perplexity": 15.744437544574787, "eval_wikibio_pred_num_tokens": 107.90625, "eval_wikibio_rouge_score": 0.2471619965149385, "eval_wikibio_runtime": 30.5624, "eval_wikibio_samples_per_second": 16.36, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2792750232409622, "eval_wikibio_token_set_f1_sem": 0.014101591479479404, "eval_wikibio_token_set_precision": 0.2416148053020468, "eval_wikibio_token_set_recall": 0.3844939727029203, "eval_wikibio_true_num_tokens": 100.2109375, "step": 250000 }, { "epoch": 14.62244838275721, "eval_msmarco_accuracy": 0.384546875, "eval_msmarco_bleu_score": 13.931444150224348, "eval_msmarco_bleu_score_sem": 1.2427267834515814, "eval_msmarco_emb_cos_sim": 0.7731488347053528, "eval_msmarco_emb_cos_sim_sem": 0.015359154902398586, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8242413997650146, "eval_msmarco_n_ngrams_match_1": 26.6796875, "eval_msmarco_n_ngrams_match_2": 11.71875, "eval_msmarco_n_ngrams_match_3": 6.4453125, "eval_msmarco_num_pred_words": 63.34375, "eval_msmarco_num_true_words": 63.3046875, "eval_msmarco_perplexity": 6.198091361669194, "eval_msmarco_pred_num_tokens": 87.03125, "eval_msmarco_rouge_score": 0.39452740222698335, "eval_msmarco_runtime": 25.4003, "eval_msmarco_samples_per_second": 19.685, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4227198280681431, "eval_msmarco_token_set_f1_sem": 0.013509870513996762, "eval_msmarco_token_set_precision": 0.38712166179298524, "eval_msmarco_token_set_recall": 0.48865235627015147, "eval_msmarco_true_num_tokens": 81.90625, "step": 250000 }, { "epoch": 14.628297362110311, "grad_norm": 0.27425628900527954, "learning_rate": 0.001, "loss": 1.9685, "step": 250100 }, { "epoch": 14.634146341463415, "grad_norm": 0.28877371549606323, "learning_rate": 0.001, "loss": 1.9756, "step": 250200 }, { "epoch": 14.639995320816517, "grad_norm": 0.2675548195838928, "learning_rate": 0.001, "loss": 1.9708, "step": 250300 }, { "epoch": 14.645844300169621, "grad_norm": 0.2118586152791977, "learning_rate": 0.001, "loss": 1.9699, "step": 250400 }, { "epoch": 14.651693279522723, "grad_norm": 0.2042689174413681, "learning_rate": 0.001, "loss": 1.9677, "step": 250500 }, { "epoch": 14.657542258875827, "grad_norm": 0.22809909284114838, "learning_rate": 0.001, "loss": 1.963, "step": 250600 }, { "epoch": 14.663391238228929, "grad_norm": 0.23814690113067627, "learning_rate": 0.001, "loss": 1.9723, "step": 250700 }, { "epoch": 14.669240217582033, "grad_norm": 0.23318862915039062, "learning_rate": 0.001, "loss": 1.9703, "step": 250800 }, { "epoch": 14.675089196935135, "grad_norm": 0.24017928540706635, "learning_rate": 0.001, "loss": 1.9647, "step": 250900 }, { "epoch": 14.680938176288238, "grad_norm": 0.2090958058834076, "learning_rate": 0.001, "loss": 1.9729, "step": 251000 }, { "epoch": 14.68678715564134, "grad_norm": 0.15921002626419067, "learning_rate": 0.001, "loss": 1.9684, "step": 251100 }, { "epoch": 14.692636134994444, "grad_norm": 0.25061503052711487, "learning_rate": 0.001, "loss": 1.9701, "step": 251200 }, { "epoch": 14.698485114347546, "grad_norm": 0.3292587697505951, "learning_rate": 0.001, "loss": 1.9762, "step": 251300 }, { "epoch": 14.704334093700648, "grad_norm": 0.2586134076118469, "learning_rate": 0.001, "loss": 1.9685, "step": 251400 }, { "epoch": 14.710183073053752, "grad_norm": 0.23291458189487457, "learning_rate": 0.001, "loss": 1.9664, "step": 251500 }, { "epoch": 14.716032052406854, "grad_norm": 0.19936180114746094, "learning_rate": 0.001, "loss": 1.9639, "step": 251600 }, { "epoch": 14.721881031759958, "grad_norm": 0.21073633432388306, "learning_rate": 0.001, "loss": 1.9682, "step": 251700 }, { "epoch": 14.72773001111306, "grad_norm": 0.20967675745487213, "learning_rate": 0.001, "loss": 1.9722, "step": 251800 }, { "epoch": 14.733578990466164, "grad_norm": 0.26561346650123596, "learning_rate": 0.001, "loss": 1.963, "step": 251900 }, { "epoch": 14.739427969819266, "grad_norm": 0.20049498975276947, "learning_rate": 0.001, "loss": 1.9743, "step": 252000 }, { "epoch": 14.74527694917237, "grad_norm": 0.1979212611913681, "learning_rate": 0.001, "loss": 1.9722, "step": 252100 }, { "epoch": 14.751125928525472, "grad_norm": 0.24769864976406097, "learning_rate": 0.001, "loss": 1.9719, "step": 252200 }, { "epoch": 14.756974907878575, "grad_norm": 0.32003793120384216, "learning_rate": 0.001, "loss": 1.9736, "step": 252300 }, { "epoch": 14.762823887231677, "grad_norm": 0.2135101705789566, "learning_rate": 0.001, "loss": 1.9721, "step": 252400 }, { "epoch": 14.768672866584781, "grad_norm": 0.20969155430793762, "learning_rate": 0.001, "loss": 1.9686, "step": 252500 }, { "epoch": 14.774521845937883, "grad_norm": 0.20636795461177826, "learning_rate": 0.001, "loss": 1.9704, "step": 252600 }, { "epoch": 14.780370825290987, "grad_norm": 0.2298334389925003, "learning_rate": 0.001, "loss": 1.978, "step": 252700 }, { "epoch": 14.78621980464409, "grad_norm": 0.2764069437980652, "learning_rate": 0.001, "loss": 1.9731, "step": 252800 }, { "epoch": 14.792068783997193, "grad_norm": 0.2130342274904251, "learning_rate": 0.001, "loss": 1.97, "step": 252900 }, { "epoch": 14.797917763350295, "grad_norm": 0.2609170973300934, "learning_rate": 0.001, "loss": 1.966, "step": 253000 }, { "epoch": 14.803766742703399, "grad_norm": 0.21177765727043152, "learning_rate": 0.001, "loss": 1.9629, "step": 253100 }, { "epoch": 14.8096157220565, "grad_norm": 0.19876885414123535, "learning_rate": 0.001, "loss": 1.9661, "step": 253200 }, { "epoch": 14.815464701409605, "grad_norm": 0.25627654790878296, "learning_rate": 0.001, "loss": 1.9736, "step": 253300 }, { "epoch": 14.821313680762707, "grad_norm": 0.21709460020065308, "learning_rate": 0.001, "loss": 1.9662, "step": 253400 }, { "epoch": 14.82716266011581, "grad_norm": 0.20118048787117004, "learning_rate": 0.001, "loss": 1.9669, "step": 253500 }, { "epoch": 14.833011639468912, "grad_norm": 0.27324187755584717, "learning_rate": 0.001, "loss": 1.9737, "step": 253600 }, { "epoch": 14.838860618822016, "grad_norm": 0.22182808816432953, "learning_rate": 0.001, "loss": 1.9707, "step": 253700 }, { "epoch": 14.844709598175118, "grad_norm": 0.20790088176727295, "learning_rate": 0.001, "loss": 1.968, "step": 253800 }, { "epoch": 14.850558577528222, "grad_norm": 0.16716419160366058, "learning_rate": 0.001, "loss": 1.9687, "step": 253900 }, { "epoch": 14.856407556881324, "grad_norm": 0.33260577917099, "learning_rate": 0.001, "loss": 1.9682, "step": 254000 }, { "epoch": 14.862256536234428, "grad_norm": 0.19898882508277893, "learning_rate": 0.001, "loss": 1.9686, "step": 254100 }, { "epoch": 14.86810551558753, "grad_norm": 0.22846923768520355, "learning_rate": 0.001, "loss": 1.9652, "step": 254200 }, { "epoch": 14.873954494940634, "grad_norm": 0.2001464068889618, "learning_rate": 0.001, "loss": 1.9706, "step": 254300 }, { "epoch": 14.879803474293736, "grad_norm": 0.22551709413528442, "learning_rate": 0.001, "loss": 1.9677, "step": 254400 }, { "epoch": 14.885652453646838, "grad_norm": 0.24258673191070557, "learning_rate": 0.001, "loss": 1.9733, "step": 254500 }, { "epoch": 14.891501432999942, "grad_norm": 0.21538107097148895, "learning_rate": 0.001, "loss": 1.976, "step": 254600 }, { "epoch": 14.897350412353044, "grad_norm": 0.17323555052280426, "learning_rate": 0.001, "loss": 1.969, "step": 254700 }, { "epoch": 14.903199391706147, "grad_norm": 0.2351198047399521, "learning_rate": 0.001, "loss": 1.9624, "step": 254800 }, { "epoch": 14.90904837105925, "grad_norm": 0.22755157947540283, "learning_rate": 0.001, "loss": 1.967, "step": 254900 }, { "epoch": 14.914897350412353, "grad_norm": 0.22144076228141785, "learning_rate": 0.001, "loss": 1.9665, "step": 255000 }, { "epoch": 14.920746329765455, "grad_norm": 0.2332659810781479, "learning_rate": 0.001, "loss": 1.9658, "step": 255100 }, { "epoch": 14.926595309118559, "grad_norm": 0.27159759402275085, "learning_rate": 0.001, "loss": 1.9674, "step": 255200 }, { "epoch": 14.932444288471661, "grad_norm": 0.2609262764453888, "learning_rate": 0.001, "loss": 1.966, "step": 255300 }, { "epoch": 14.938293267824765, "grad_norm": 0.1985473483800888, "learning_rate": 0.001, "loss": 1.9722, "step": 255400 }, { "epoch": 14.944142247177867, "grad_norm": 0.1858747899532318, "learning_rate": 0.001, "loss": 1.9708, "step": 255500 }, { "epoch": 14.94999122653097, "grad_norm": 0.2299753725528717, "learning_rate": 0.001, "loss": 1.9678, "step": 255600 }, { "epoch": 14.955840205884073, "grad_norm": 0.21365107595920563, "learning_rate": 0.001, "loss": 1.9717, "step": 255700 }, { "epoch": 14.961689185237177, "grad_norm": 0.25365275144577026, "learning_rate": 0.001, "loss": 1.9633, "step": 255800 }, { "epoch": 14.967538164590279, "grad_norm": 0.2897484302520752, "learning_rate": 0.001, "loss": 1.9695, "step": 255900 }, { "epoch": 14.973387143943382, "grad_norm": 0.1828140765428543, "learning_rate": 0.001, "loss": 1.9729, "step": 256000 }, { "epoch": 14.979236123296484, "grad_norm": 0.2925907075405121, "learning_rate": 0.001, "loss": 1.9732, "step": 256100 }, { "epoch": 14.985085102649588, "grad_norm": 0.24347858130931854, "learning_rate": 0.001, "loss": 1.9665, "step": 256200 }, { "epoch": 14.99093408200269, "grad_norm": 0.223737433552742, "learning_rate": 0.001, "loss": 1.9648, "step": 256300 }, { "epoch": 14.996783061355794, "grad_norm": 0.25504645705223083, "learning_rate": 0.001, "loss": 1.9662, "step": 256400 }, { "epoch": 15.002632040708896, "grad_norm": 0.22147981822490692, "learning_rate": 0.001, "loss": 1.9663, "step": 256500 }, { "epoch": 15.008481020062, "grad_norm": 0.24946321547031403, "learning_rate": 0.001, "loss": 1.9553, "step": 256600 }, { "epoch": 15.014329999415102, "grad_norm": 0.24803322553634644, "learning_rate": 0.001, "loss": 1.9555, "step": 256700 }, { "epoch": 15.020178978768206, "grad_norm": 0.2361343801021576, "learning_rate": 0.001, "loss": 1.9533, "step": 256800 }, { "epoch": 15.026027958121308, "grad_norm": 0.20841069519519806, "learning_rate": 0.001, "loss": 1.9516, "step": 256900 }, { "epoch": 15.031876937474411, "grad_norm": 0.23341284692287445, "learning_rate": 0.001, "loss": 1.9629, "step": 257000 }, { "epoch": 15.037725916827513, "grad_norm": 0.21816353499889374, "learning_rate": 0.001, "loss": 1.9495, "step": 257100 }, { "epoch": 15.043574896180617, "grad_norm": 0.2215181291103363, "learning_rate": 0.001, "loss": 1.9488, "step": 257200 }, { "epoch": 15.04942387553372, "grad_norm": 0.1986027956008911, "learning_rate": 0.001, "loss": 1.9527, "step": 257300 }, { "epoch": 15.055272854886823, "grad_norm": 0.29324236512184143, "learning_rate": 0.001, "loss": 1.9514, "step": 257400 }, { "epoch": 15.061121834239925, "grad_norm": 0.2643941640853882, "learning_rate": 0.001, "loss": 1.957, "step": 257500 }, { "epoch": 15.066970813593027, "grad_norm": 0.21345831453800201, "learning_rate": 0.001, "loss": 1.9618, "step": 257600 }, { "epoch": 15.072819792946131, "grad_norm": 0.1851228028535843, "learning_rate": 0.001, "loss": 1.9542, "step": 257700 }, { "epoch": 15.078668772299233, "grad_norm": 0.28422582149505615, "learning_rate": 0.001, "loss": 1.9514, "step": 257800 }, { "epoch": 15.084517751652337, "grad_norm": 0.1818804293870926, "learning_rate": 0.001, "loss": 1.9587, "step": 257900 }, { "epoch": 15.090366731005439, "grad_norm": 0.23703233897686005, "learning_rate": 0.001, "loss": 1.9541, "step": 258000 }, { "epoch": 15.096215710358543, "grad_norm": 0.22809487581253052, "learning_rate": 0.001, "loss": 1.9576, "step": 258100 }, { "epoch": 15.102064689711645, "grad_norm": 0.26587021350860596, "learning_rate": 0.001, "loss": 1.9568, "step": 258200 }, { "epoch": 15.107913669064748, "grad_norm": 0.18658319115638733, "learning_rate": 0.001, "loss": 1.9544, "step": 258300 }, { "epoch": 15.11376264841785, "grad_norm": 0.2105674147605896, "learning_rate": 0.001, "loss": 1.9562, "step": 258400 }, { "epoch": 15.119611627770954, "grad_norm": 0.28780314326286316, "learning_rate": 0.001, "loss": 1.9591, "step": 258500 }, { "epoch": 15.125460607124056, "grad_norm": 0.21428292989730835, "learning_rate": 0.001, "loss": 1.9584, "step": 258600 }, { "epoch": 15.13130958647716, "grad_norm": 0.2277550995349884, "learning_rate": 0.001, "loss": 1.9534, "step": 258700 }, { "epoch": 15.137158565830262, "grad_norm": 0.3089875280857086, "learning_rate": 0.001, "loss": 1.957, "step": 258800 }, { "epoch": 15.143007545183366, "grad_norm": 0.23050200939178467, "learning_rate": 0.001, "loss": 1.9576, "step": 258900 }, { "epoch": 15.148856524536468, "grad_norm": 0.25895851850509644, "learning_rate": 0.001, "loss": 1.9546, "step": 259000 }, { "epoch": 15.154705503889572, "grad_norm": 0.16655218601226807, "learning_rate": 0.001, "loss": 1.9503, "step": 259100 }, { "epoch": 15.160554483242674, "grad_norm": 0.2355329692363739, "learning_rate": 0.001, "loss": 1.9511, "step": 259200 }, { "epoch": 15.166403462595778, "grad_norm": 0.25971752405166626, "learning_rate": 0.001, "loss": 1.9638, "step": 259300 }, { "epoch": 15.17225244194888, "grad_norm": 0.21498103439807892, "learning_rate": 0.001, "loss": 1.953, "step": 259400 }, { "epoch": 15.178101421301983, "grad_norm": 0.25893011689186096, "learning_rate": 0.001, "loss": 1.9572, "step": 259500 }, { "epoch": 15.183950400655085, "grad_norm": 0.22079885005950928, "learning_rate": 0.001, "loss": 1.9525, "step": 259600 }, { "epoch": 15.18979938000819, "grad_norm": 0.23287825286388397, "learning_rate": 0.001, "loss": 1.9532, "step": 259700 }, { "epoch": 15.195648359361291, "grad_norm": 0.21729974448680878, "learning_rate": 0.001, "loss": 1.957, "step": 259800 }, { "epoch": 15.201497338714395, "grad_norm": 0.23515468835830688, "learning_rate": 0.001, "loss": 1.9511, "step": 259900 }, { "epoch": 15.207346318067497, "grad_norm": 0.21208947896957397, "learning_rate": 0.001, "loss": 1.9582, "step": 260000 }, { "epoch": 15.2131952974206, "grad_norm": 0.19089972972869873, "learning_rate": 0.001, "loss": 1.9545, "step": 260100 }, { "epoch": 15.219044276773703, "grad_norm": 0.301233172416687, "learning_rate": 0.001, "loss": 1.9574, "step": 260200 }, { "epoch": 15.224893256126807, "grad_norm": 0.2256106436252594, "learning_rate": 0.001, "loss": 1.962, "step": 260300 }, { "epoch": 15.230742235479909, "grad_norm": 0.16770634055137634, "learning_rate": 0.001, "loss": 1.9569, "step": 260400 }, { "epoch": 15.236591214833012, "grad_norm": 0.2519789934158325, "learning_rate": 0.001, "loss": 1.9596, "step": 260500 }, { "epoch": 15.242440194186115, "grad_norm": 0.22102241218090057, "learning_rate": 0.001, "loss": 1.9499, "step": 260600 }, { "epoch": 15.248289173539217, "grad_norm": 0.19765639305114746, "learning_rate": 0.001, "loss": 1.9554, "step": 260700 }, { "epoch": 15.25413815289232, "grad_norm": 0.22404569387435913, "learning_rate": 0.001, "loss": 1.9581, "step": 260800 }, { "epoch": 15.259987132245422, "grad_norm": 0.22371289134025574, "learning_rate": 0.001, "loss": 1.9583, "step": 260900 }, { "epoch": 15.265836111598526, "grad_norm": 0.23874172568321228, "learning_rate": 0.001, "loss": 1.962, "step": 261000 }, { "epoch": 15.271685090951628, "grad_norm": 0.2421940416097641, "learning_rate": 0.001, "loss": 1.9524, "step": 261100 }, { "epoch": 15.277534070304732, "grad_norm": 0.24645960330963135, "learning_rate": 0.001, "loss": 1.9627, "step": 261200 }, { "epoch": 15.283383049657834, "grad_norm": 0.16430503129959106, "learning_rate": 0.001, "loss": 1.9601, "step": 261300 }, { "epoch": 15.289232029010938, "grad_norm": 0.21487198770046234, "learning_rate": 0.001, "loss": 1.9539, "step": 261400 }, { "epoch": 15.29508100836404, "grad_norm": 0.21747243404388428, "learning_rate": 0.001, "loss": 1.9557, "step": 261500 }, { "epoch": 15.300929987717144, "grad_norm": 0.24126477539539337, "learning_rate": 0.001, "loss": 1.9596, "step": 261600 }, { "epoch": 15.306778967070246, "grad_norm": 0.20239749550819397, "learning_rate": 0.001, "loss": 1.9584, "step": 261700 }, { "epoch": 15.31262794642335, "grad_norm": 0.2088182121515274, "learning_rate": 0.001, "loss": 1.96, "step": 261800 }, { "epoch": 15.318476925776451, "grad_norm": 0.29906952381134033, "learning_rate": 0.001, "loss": 1.9587, "step": 261900 }, { "epoch": 15.324325905129555, "grad_norm": 0.23409686982631683, "learning_rate": 0.001, "loss": 1.9589, "step": 262000 }, { "epoch": 15.330174884482657, "grad_norm": 0.23842673003673553, "learning_rate": 0.001, "loss": 1.9542, "step": 262100 }, { "epoch": 15.336023863835761, "grad_norm": 0.2111084759235382, "learning_rate": 0.001, "loss": 1.9591, "step": 262200 }, { "epoch": 15.341872843188863, "grad_norm": 0.26502010226249695, "learning_rate": 0.001, "loss": 1.9641, "step": 262300 }, { "epoch": 15.347721822541967, "grad_norm": 0.1983301341533661, "learning_rate": 0.001, "loss": 1.954, "step": 262400 }, { "epoch": 15.353570801895069, "grad_norm": 0.1991146057844162, "learning_rate": 0.001, "loss": 1.9601, "step": 262500 }, { "epoch": 15.359419781248173, "grad_norm": 0.18505315482616425, "learning_rate": 0.001, "loss": 1.9547, "step": 262600 }, { "epoch": 15.365268760601275, "grad_norm": 0.24875737726688385, "learning_rate": 0.001, "loss": 1.9578, "step": 262700 }, { "epoch": 15.371117739954379, "grad_norm": 0.2640834450721741, "learning_rate": 0.001, "loss": 1.9614, "step": 262800 }, { "epoch": 15.37696671930748, "grad_norm": 0.24885545670986176, "learning_rate": 0.001, "loss": 1.9559, "step": 262900 }, { "epoch": 15.382815698660584, "grad_norm": 0.18513792753219604, "learning_rate": 0.001, "loss": 1.956, "step": 263000 }, { "epoch": 15.388664678013686, "grad_norm": 0.2264518141746521, "learning_rate": 0.001, "loss": 1.9608, "step": 263100 }, { "epoch": 15.39451365736679, "grad_norm": 0.2808336019515991, "learning_rate": 0.001, "loss": 1.9654, "step": 263200 }, { "epoch": 15.400362636719892, "grad_norm": 0.2301323115825653, "learning_rate": 0.001, "loss": 1.9555, "step": 263300 }, { "epoch": 15.406211616072996, "grad_norm": 0.17883464694023132, "learning_rate": 0.001, "loss": 1.9527, "step": 263400 }, { "epoch": 15.412060595426098, "grad_norm": 0.2344859540462494, "learning_rate": 0.001, "loss": 1.9594, "step": 263500 }, { "epoch": 15.417909574779202, "grad_norm": 0.22445669770240784, "learning_rate": 0.001, "loss": 1.9612, "step": 263600 }, { "epoch": 15.423758554132304, "grad_norm": 0.2783609926700592, "learning_rate": 0.001, "loss": 1.9651, "step": 263700 }, { "epoch": 15.429607533485406, "grad_norm": 0.3003453016281128, "learning_rate": 0.001, "loss": 1.9582, "step": 263800 }, { "epoch": 15.43545651283851, "grad_norm": 0.24643449485301971, "learning_rate": 0.001, "loss": 1.9566, "step": 263900 }, { "epoch": 15.441305492191612, "grad_norm": 0.22894933819770813, "learning_rate": 0.001, "loss": 1.9542, "step": 264000 }, { "epoch": 15.447154471544716, "grad_norm": 0.20408478379249573, "learning_rate": 0.001, "loss": 1.9597, "step": 264100 }, { "epoch": 15.453003450897818, "grad_norm": 0.22383162379264832, "learning_rate": 0.001, "loss": 1.9561, "step": 264200 }, { "epoch": 15.458852430250921, "grad_norm": 0.2596486806869507, "learning_rate": 0.001, "loss": 1.9672, "step": 264300 }, { "epoch": 15.464701409604023, "grad_norm": 0.22894328832626343, "learning_rate": 0.001, "loss": 1.9572, "step": 264400 }, { "epoch": 15.470550388957127, "grad_norm": 0.23095077276229858, "learning_rate": 0.001, "loss": 1.9527, "step": 264500 }, { "epoch": 15.47639936831023, "grad_norm": 0.2585543394088745, "learning_rate": 0.001, "loss": 1.9561, "step": 264600 }, { "epoch": 15.482248347663333, "grad_norm": 0.22969470918178558, "learning_rate": 0.001, "loss": 1.9601, "step": 264700 }, { "epoch": 15.488097327016435, "grad_norm": 0.22285422682762146, "learning_rate": 0.001, "loss": 1.9587, "step": 264800 }, { "epoch": 15.493946306369539, "grad_norm": 0.2219599038362503, "learning_rate": 0.001, "loss": 1.9583, "step": 264900 }, { "epoch": 15.49979528572264, "grad_norm": 0.23696967959403992, "learning_rate": 0.001, "loss": 1.9591, "step": 265000 }, { "epoch": 15.505644265075745, "grad_norm": 0.2065456360578537, "learning_rate": 0.001, "loss": 1.9563, "step": 265100 }, { "epoch": 15.511493244428847, "grad_norm": 0.18845897912979126, "learning_rate": 0.001, "loss": 1.9617, "step": 265200 }, { "epoch": 15.51734222378195, "grad_norm": 0.194162055850029, "learning_rate": 0.001, "loss": 1.958, "step": 265300 }, { "epoch": 15.523191203135053, "grad_norm": 0.2279660999774933, "learning_rate": 0.001, "loss": 1.9612, "step": 265400 }, { "epoch": 15.529040182488156, "grad_norm": 0.19233183562755585, "learning_rate": 0.001, "loss": 1.9582, "step": 265500 }, { "epoch": 15.534889161841258, "grad_norm": 0.2108229398727417, "learning_rate": 0.001, "loss": 1.9603, "step": 265600 }, { "epoch": 15.540738141194362, "grad_norm": 0.2668645679950714, "learning_rate": 0.001, "loss": 1.9581, "step": 265700 }, { "epoch": 15.546587120547464, "grad_norm": 0.2317768782377243, "learning_rate": 0.001, "loss": 1.9599, "step": 265800 }, { "epoch": 15.552436099900568, "grad_norm": 0.1978548765182495, "learning_rate": 0.001, "loss": 1.9563, "step": 265900 }, { "epoch": 15.55828507925367, "grad_norm": 0.21475912630558014, "learning_rate": 0.001, "loss": 1.9587, "step": 266000 }, { "epoch": 15.564134058606774, "grad_norm": 0.2079111933708191, "learning_rate": 0.001, "loss": 1.9634, "step": 266100 }, { "epoch": 15.569983037959876, "grad_norm": 0.17239941656589508, "learning_rate": 0.001, "loss": 1.9567, "step": 266200 }, { "epoch": 15.57583201731298, "grad_norm": 0.29607996344566345, "learning_rate": 0.001, "loss": 1.9656, "step": 266300 }, { "epoch": 15.581680996666082, "grad_norm": 0.18370147049427032, "learning_rate": 0.001, "loss": 1.9652, "step": 266400 }, { "epoch": 15.587529976019185, "grad_norm": 0.2231922745704651, "learning_rate": 0.001, "loss": 1.9586, "step": 266500 }, { "epoch": 15.593378955372287, "grad_norm": 0.17706501483917236, "learning_rate": 0.001, "loss": 1.9605, "step": 266600 }, { "epoch": 15.599227934725391, "grad_norm": 0.23015044629573822, "learning_rate": 0.001, "loss": 1.9506, "step": 266700 }, { "epoch": 15.605076914078493, "grad_norm": 0.2363358438014984, "learning_rate": 0.001, "loss": 1.959, "step": 266800 }, { "epoch": 15.610925893431595, "grad_norm": 0.2222004532814026, "learning_rate": 0.001, "loss": 1.9571, "step": 266900 }, { "epoch": 15.6167748727847, "grad_norm": 0.21743933856487274, "learning_rate": 0.001, "loss": 1.9612, "step": 267000 }, { "epoch": 15.622623852137801, "grad_norm": 0.42748168110847473, "learning_rate": 0.001, "loss": 1.9596, "step": 267100 }, { "epoch": 15.628472831490905, "grad_norm": 0.27018123865127563, "learning_rate": 0.001, "loss": 1.9545, "step": 267200 }, { "epoch": 15.634321810844007, "grad_norm": 0.3302766978740692, "learning_rate": 0.001, "loss": 1.9615, "step": 267300 }, { "epoch": 15.64017079019711, "grad_norm": 0.2128361463546753, "learning_rate": 0.001, "loss": 1.966, "step": 267400 }, { "epoch": 15.646019769550213, "grad_norm": 0.24593156576156616, "learning_rate": 0.001, "loss": 1.9573, "step": 267500 }, { "epoch": 15.651868748903317, "grad_norm": 0.19361968338489532, "learning_rate": 0.001, "loss": 1.9587, "step": 267600 }, { "epoch": 15.657717728256419, "grad_norm": 0.2209482192993164, "learning_rate": 0.001, "loss": 1.961, "step": 267700 }, { "epoch": 15.663566707609522, "grad_norm": 0.22701643407344818, "learning_rate": 0.001, "loss": 1.9595, "step": 267800 }, { "epoch": 15.669415686962624, "grad_norm": 0.23972535133361816, "learning_rate": 0.001, "loss": 1.9665, "step": 267900 }, { "epoch": 15.675264666315728, "grad_norm": 0.28422799706459045, "learning_rate": 0.001, "loss": 1.9581, "step": 268000 }, { "epoch": 15.68111364566883, "grad_norm": 0.1924351453781128, "learning_rate": 0.001, "loss": 1.9543, "step": 268100 }, { "epoch": 15.686962625021934, "grad_norm": 0.17921961843967438, "learning_rate": 0.001, "loss": 1.9538, "step": 268200 }, { "epoch": 15.692811604375036, "grad_norm": 0.21632267534732819, "learning_rate": 0.001, "loss": 1.9553, "step": 268300 }, { "epoch": 15.69866058372814, "grad_norm": 0.2610253691673279, "learning_rate": 0.001, "loss": 1.9666, "step": 268400 }, { "epoch": 15.704509563081242, "grad_norm": 0.23588353395462036, "learning_rate": 0.001, "loss": 1.9687, "step": 268500 }, { "epoch": 15.710358542434346, "grad_norm": 0.2184315323829651, "learning_rate": 0.001, "loss": 1.9595, "step": 268600 }, { "epoch": 15.716207521787448, "grad_norm": 0.21137680113315582, "learning_rate": 0.001, "loss": 1.9669, "step": 268700 }, { "epoch": 15.722056501140552, "grad_norm": 0.22723546624183655, "learning_rate": 0.001, "loss": 1.9571, "step": 268800 }, { "epoch": 15.727905480493654, "grad_norm": 0.25294923782348633, "learning_rate": 0.001, "loss": 1.9612, "step": 268900 }, { "epoch": 15.733754459846757, "grad_norm": 0.22334547340869904, "learning_rate": 0.001, "loss": 1.9661, "step": 269000 }, { "epoch": 15.73960343919986, "grad_norm": 0.2087256908416748, "learning_rate": 0.001, "loss": 1.9558, "step": 269100 }, { "epoch": 15.745452418552963, "grad_norm": 0.21041417121887207, "learning_rate": 0.001, "loss": 1.9579, "step": 269200 }, { "epoch": 15.751301397906065, "grad_norm": 0.19367845356464386, "learning_rate": 0.001, "loss": 1.9548, "step": 269300 }, { "epoch": 15.757150377259169, "grad_norm": 0.2763846218585968, "learning_rate": 0.001, "loss": 1.9546, "step": 269400 }, { "epoch": 15.762999356612271, "grad_norm": 0.18300019204616547, "learning_rate": 0.001, "loss": 1.9582, "step": 269500 }, { "epoch": 15.768848335965375, "grad_norm": 0.2156713604927063, "learning_rate": 0.001, "loss": 1.9584, "step": 269600 }, { "epoch": 15.774697315318477, "grad_norm": 0.19602100551128387, "learning_rate": 0.001, "loss": 1.9512, "step": 269700 }, { "epoch": 15.78054629467158, "grad_norm": 0.2247847318649292, "learning_rate": 0.001, "loss": 1.9618, "step": 269800 }, { "epoch": 15.786395274024683, "grad_norm": 0.2143876552581787, "learning_rate": 0.001, "loss": 1.9609, "step": 269900 }, { "epoch": 15.792244253377785, "grad_norm": 0.2523854374885559, "learning_rate": 0.001, "loss": 1.9552, "step": 270000 }, { "epoch": 15.798093232730889, "grad_norm": 0.33115074038505554, "learning_rate": 0.001, "loss": 1.9533, "step": 270100 }, { "epoch": 15.80394221208399, "grad_norm": 0.25585681200027466, "learning_rate": 0.001, "loss": 1.9638, "step": 270200 }, { "epoch": 15.809791191437094, "grad_norm": 0.19301773607730865, "learning_rate": 0.001, "loss": 1.958, "step": 270300 }, { "epoch": 15.815640170790196, "grad_norm": 0.24051471054553986, "learning_rate": 0.001, "loss": 1.9594, "step": 270400 }, { "epoch": 15.8214891501433, "grad_norm": 0.20138023793697357, "learning_rate": 0.001, "loss": 1.9621, "step": 270500 }, { "epoch": 15.827338129496402, "grad_norm": 0.22156234085559845, "learning_rate": 0.001, "loss": 1.9623, "step": 270600 }, { "epoch": 15.833187108849506, "grad_norm": 0.20112906396389008, "learning_rate": 0.001, "loss": 1.9568, "step": 270700 }, { "epoch": 15.839036088202608, "grad_norm": 0.2370229810476303, "learning_rate": 0.001, "loss": 1.967, "step": 270800 }, { "epoch": 15.844885067555712, "grad_norm": 0.21079622209072113, "learning_rate": 0.001, "loss": 1.9614, "step": 270900 }, { "epoch": 15.850734046908814, "grad_norm": 0.2401232123374939, "learning_rate": 0.001, "loss": 1.956, "step": 271000 }, { "epoch": 15.856583026261918, "grad_norm": 0.20491868257522583, "learning_rate": 0.001, "loss": 1.9613, "step": 271100 }, { "epoch": 15.86243200561502, "grad_norm": 0.20856961607933044, "learning_rate": 0.001, "loss": 1.958, "step": 271200 }, { "epoch": 15.868280984968123, "grad_norm": 0.22935079038143158, "learning_rate": 0.001, "loss": 1.9581, "step": 271300 }, { "epoch": 15.874129964321225, "grad_norm": 0.23698890209197998, "learning_rate": 0.001, "loss": 1.9569, "step": 271400 }, { "epoch": 15.87997894367433, "grad_norm": 0.22814810276031494, "learning_rate": 0.001, "loss": 1.9594, "step": 271500 }, { "epoch": 15.885827923027431, "grad_norm": 0.21588826179504395, "learning_rate": 0.001, "loss": 1.958, "step": 271600 }, { "epoch": 15.891676902380535, "grad_norm": 0.1888931691646576, "learning_rate": 0.001, "loss": 1.9557, "step": 271700 }, { "epoch": 15.897525881733637, "grad_norm": 0.1777532398700714, "learning_rate": 0.001, "loss": 1.9549, "step": 271800 }, { "epoch": 15.903374861086741, "grad_norm": 0.2920968234539032, "learning_rate": 0.001, "loss": 1.9648, "step": 271900 }, { "epoch": 15.909223840439843, "grad_norm": 0.21294037997722626, "learning_rate": 0.001, "loss": 1.9637, "step": 272000 }, { "epoch": 15.915072819792947, "grad_norm": 0.23189102113246918, "learning_rate": 0.001, "loss": 1.9596, "step": 272100 }, { "epoch": 15.920921799146049, "grad_norm": 0.2007284164428711, "learning_rate": 0.001, "loss": 1.9554, "step": 272200 }, { "epoch": 15.926770778499153, "grad_norm": 0.21439017355442047, "learning_rate": 0.001, "loss": 1.9631, "step": 272300 }, { "epoch": 15.932619757852255, "grad_norm": 0.21970051527023315, "learning_rate": 0.001, "loss": 1.9654, "step": 272400 }, { "epoch": 15.938468737205358, "grad_norm": 0.19794094562530518, "learning_rate": 0.001, "loss": 1.9564, "step": 272500 }, { "epoch": 15.94431771655846, "grad_norm": 0.2948787212371826, "learning_rate": 0.001, "loss": 1.9578, "step": 272600 }, { "epoch": 15.950166695911564, "grad_norm": 0.2510412931442261, "learning_rate": 0.001, "loss": 1.9707, "step": 272700 }, { "epoch": 15.956015675264666, "grad_norm": 0.21000629663467407, "learning_rate": 0.001, "loss": 1.96, "step": 272800 }, { "epoch": 15.96186465461777, "grad_norm": 0.26883116364479065, "learning_rate": 0.001, "loss": 1.9626, "step": 272900 }, { "epoch": 15.967713633970872, "grad_norm": 0.2628576457500458, "learning_rate": 0.001, "loss": 1.9646, "step": 273000 }, { "epoch": 15.973562613323974, "grad_norm": 0.2142142653465271, "learning_rate": 0.001, "loss": 1.9527, "step": 273100 }, { "epoch": 15.979411592677078, "grad_norm": 0.21568934619426727, "learning_rate": 0.001, "loss": 1.9563, "step": 273200 }, { "epoch": 15.98526057203018, "grad_norm": 0.30624184012413025, "learning_rate": 0.001, "loss": 1.9584, "step": 273300 }, { "epoch": 15.991109551383284, "grad_norm": 0.19792328774929047, "learning_rate": 0.001, "loss": 1.9631, "step": 273400 }, { "epoch": 15.996958530736386, "grad_norm": 0.2366321086883545, "learning_rate": 0.001, "loss": 1.9564, "step": 273500 }, { "epoch": 16.00280751008949, "grad_norm": 0.23317833244800568, "learning_rate": 0.001, "loss": 1.9533, "step": 273600 }, { "epoch": 16.008656489442593, "grad_norm": 0.2358313798904419, "learning_rate": 0.001, "loss": 1.9379, "step": 273700 }, { "epoch": 16.014505468795694, "grad_norm": 0.197809636592865, "learning_rate": 0.001, "loss": 1.9386, "step": 273800 }, { "epoch": 16.020354448148797, "grad_norm": 0.2365291267633438, "learning_rate": 0.001, "loss": 1.9425, "step": 273900 }, { "epoch": 16.0262034275019, "grad_norm": 0.2346205711364746, "learning_rate": 0.001, "loss": 1.9411, "step": 274000 }, { "epoch": 16.032052406855005, "grad_norm": 0.22985966503620148, "learning_rate": 0.001, "loss": 1.944, "step": 274100 }, { "epoch": 16.037901386208105, "grad_norm": 0.18958744406700134, "learning_rate": 0.001, "loss": 1.949, "step": 274200 }, { "epoch": 16.04375036556121, "grad_norm": 0.19507081806659698, "learning_rate": 0.001, "loss": 1.945, "step": 274300 }, { "epoch": 16.049599344914313, "grad_norm": 0.17203181982040405, "learning_rate": 0.001, "loss": 1.9457, "step": 274400 }, { "epoch": 16.055448324267417, "grad_norm": 0.24464718997478485, "learning_rate": 0.001, "loss": 1.947, "step": 274500 }, { "epoch": 16.061297303620517, "grad_norm": 0.2589292824268341, "learning_rate": 0.001, "loss": 1.9447, "step": 274600 }, { "epoch": 16.06714628297362, "grad_norm": 0.2238161414861679, "learning_rate": 0.001, "loss": 1.9414, "step": 274700 }, { "epoch": 16.072995262326724, "grad_norm": 0.24830211699008942, "learning_rate": 0.001, "loss": 1.9461, "step": 274800 }, { "epoch": 16.07884424167983, "grad_norm": 0.20245158672332764, "learning_rate": 0.001, "loss": 1.9443, "step": 274900 }, { "epoch": 16.08469322103293, "grad_norm": 0.24454672634601593, "learning_rate": 0.001, "loss": 1.9481, "step": 275000 }, { "epoch": 16.08469322103293, "eval_ag_news_accuracy": 0.2363125, "eval_ag_news_bleu_score": 6.826394024962413, "eval_ag_news_bleu_score_sem": 0.4943567090561686, "eval_ag_news_emb_cos_sim": 0.6993640661239624, "eval_ag_news_emb_cos_sim_sem": 0.014510939829051495, "eval_ag_news_emb_top1_equal": 0.9375, "eval_ag_news_emb_top1_equal_sem": 0.02147948183119297, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.769883871078491, "eval_ag_news_n_ngrams_match_1": 13.7421875, "eval_ag_news_n_ngrams_match_2": 4.0625, "eval_ag_news_n_ngrams_match_3": 1.4765625, "eval_ag_news_num_pred_words": 48.296875, "eval_ag_news_num_true_words": 47.0625, "eval_ag_news_perplexity": 15.956780858442034, "eval_ag_news_pred_num_tokens": 74.6328125, "eval_ag_news_rouge_score": 0.2829921982172197, "eval_ag_news_runtime": 36.1785, "eval_ag_news_samples_per_second": 13.82, "eval_ag_news_steps_per_second": 0.028, "eval_ag_news_token_set_f1": 0.32552449888614154, "eval_ag_news_token_set_f1_sem": 0.00994057803205145, "eval_ag_news_token_set_precision": 0.3033692760845411, "eval_ag_news_token_set_recall": 0.365960525031331, "eval_ag_news_true_num_tokens": 65.6015625, "step": 275000 }, { "epoch": 16.08469322103293, "eval_anthropic_toxic_prompts_accuracy": 0.099421875, "eval_anthropic_toxic_prompts_bleu_score": 43.13226110275884, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.780592806519406, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8729642629623413, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.0121516278013587, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1640625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.032861675748836264, "eval_anthropic_toxic_prompts_loss": 1.2819868326187134, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.4453125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.265625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.5078125, "eval_anthropic_toxic_prompts_num_pred_words": 14.0390625, "eval_anthropic_toxic_prompts_num_true_words": 13.546875, "eval_anthropic_toxic_prompts_perplexity": 3.6037927502721545, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.15625, "eval_anthropic_toxic_prompts_rouge_score": 0.6746901953470172, "eval_anthropic_toxic_prompts_runtime": 27.9742, "eval_anthropic_toxic_prompts_samples_per_second": 17.874, "eval_anthropic_toxic_prompts_steps_per_second": 0.036, "eval_anthropic_toxic_prompts_token_set_f1": 0.6992673202389258, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.020462895066536355, "eval_anthropic_toxic_prompts_token_set_precision": 0.7018707518847787, "eval_anthropic_toxic_prompts_token_set_recall": 0.7027955546116846, "eval_anthropic_toxic_prompts_true_num_tokens": 16.6953125, "step": 275000 }, { "epoch": 16.08469322103293, "eval_arxiv_accuracy": 0.37209375, "eval_arxiv_bleu_score": 1.345652984815561, "eval_arxiv_bleu_score_sem": 0.1280505016895843, "eval_arxiv_emb_cos_sim": 0.4286629557609558, "eval_arxiv_emb_cos_sim_sem": 0.01968236081302166, "eval_arxiv_emb_top1_equal": 0.8515625, "eval_arxiv_emb_top1_equal_sem": 0.03154846653342247, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4668710231781006, "eval_arxiv_n_ngrams_match_1": 11.0625, "eval_arxiv_n_ngrams_match_2": 1.734375, "eval_arxiv_n_ngrams_match_3": 0.34375, "eval_arxiv_num_pred_words": 48.34375, "eval_arxiv_num_true_words": 86.3125, "eval_arxiv_perplexity": 32.0363444758831, "eval_arxiv_pred_num_tokens": 125.421875, "eval_arxiv_rouge_score": 0.14678228628130735, "eval_arxiv_runtime": 28.8133, "eval_arxiv_samples_per_second": 17.353, "eval_arxiv_steps_per_second": 0.035, "eval_arxiv_token_set_f1": 0.1454657479431017, "eval_arxiv_token_set_f1_sem": 0.00852135312403795, "eval_arxiv_token_set_precision": 0.09651490585701748, "eval_arxiv_token_set_recall": 0.3886230916595629, "eval_arxiv_true_num_tokens": 124.84375, "step": 275000 }, { "epoch": 16.08469322103293, "eval_python_code_alpaca_accuracy": 0.12834375, "eval_python_code_alpaca_bleu_score": 28.16112719231132, "eval_python_code_alpaca_bleu_score_sem": 1.743902758313594, "eval_python_code_alpaca_emb_cos_sim": 0.8505762815475464, "eval_python_code_alpaca_emb_cos_sim_sem": 0.012814910151064396, "eval_python_code_alpaca_emb_top1_equal": 0.984375, "eval_python_code_alpaca_emb_top1_equal_sem": 0.011004959233105183, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5106748342514038, "eval_python_code_alpaca_n_ngrams_match_1": 10.203125, "eval_python_code_alpaca_n_ngrams_match_2": 5.5078125, "eval_python_code_alpaca_n_ngrams_match_3": 3.15625, "eval_python_code_alpaca_num_pred_words": 17.6328125, "eval_python_code_alpaca_num_true_words": 18.6875, "eval_python_code_alpaca_perplexity": 4.5297866182725075, "eval_python_code_alpaca_pred_num_tokens": 25.28125, "eval_python_code_alpaca_rouge_score": 0.579491782244163, "eval_python_code_alpaca_runtime": 29.4672, "eval_python_code_alpaca_samples_per_second": 16.968, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6030208203383679, "eval_python_code_alpaca_token_set_f1_sem": 0.014232490686978566, "eval_python_code_alpaca_token_set_precision": 0.5919708147745858, "eval_python_code_alpaca_token_set_recall": 0.6209633598874555, "eval_python_code_alpaca_true_num_tokens": 24.390625, "step": 275000 }, { "epoch": 16.08469322103293, "eval_wikibio_accuracy": 0.364296875, "eval_wikibio_bleu_score": 6.603080689825587, "eval_wikibio_bleu_score_sem": 0.7218168314054502, "eval_wikibio_emb_cos_sim": 0.5796118974685669, "eval_wikibio_emb_cos_sim_sem": 0.023233644664287567, "eval_wikibio_emb_top1_equal": 0.90625, "eval_wikibio_emb_top1_equal_sem": 0.025864720344543457, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7501611709594727, "eval_wikibio_n_ngrams_match_1": 14.6328125, "eval_wikibio_n_ngrams_match_2": 4.8828125, "eval_wikibio_n_ngrams_match_3": 1.9375, "eval_wikibio_num_pred_words": 54.3984375, "eval_wikibio_num_true_words": 56.0234375, "eval_wikibio_perplexity": 15.645153225355653, "eval_wikibio_pred_num_tokens": 109.203125, "eval_wikibio_rouge_score": 0.26532974758434474, "eval_wikibio_runtime": 30.0933, "eval_wikibio_samples_per_second": 16.615, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2788955883617017, "eval_wikibio_token_set_f1_sem": 0.01421730492049839, "eval_wikibio_token_set_precision": 0.2446823724783626, "eval_wikibio_token_set_recall": 0.35888475560545124, "eval_wikibio_true_num_tokens": 103.9453125, "step": 275000 }, { "epoch": 16.08469322103293, "eval_msmarco_accuracy": 0.389015625, "eval_msmarco_bleu_score": 15.82386960258858, "eval_msmarco_bleu_score_sem": 1.431211950895183, "eval_msmarco_emb_cos_sim": 0.7590334415435791, "eval_msmarco_emb_cos_sim_sem": 0.016665931791067123, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8068020343780518, "eval_msmarco_n_ngrams_match_1": 28.0546875, "eval_msmarco_n_ngrams_match_2": 12.390625, "eval_msmarco_n_ngrams_match_3": 7.0625, "eval_msmarco_num_pred_words": 60.671875, "eval_msmarco_num_true_words": 63.9375, "eval_msmarco_perplexity": 6.090937643851068, "eval_msmarco_pred_num_tokens": 82.5703125, "eval_msmarco_rouge_score": 0.42947866840564386, "eval_msmarco_runtime": 24.7233, "eval_msmarco_samples_per_second": 20.224, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.45408172068819996, "eval_msmarco_token_set_f1_sem": 0.01398917070303394, "eval_msmarco_token_set_precision": 0.4088396504226974, "eval_msmarco_token_set_recall": 0.529205906725498, "eval_msmarco_true_num_tokens": 80.859375, "step": 275000 }, { "epoch": 16.090542200386032, "grad_norm": 0.26722511649131775, "learning_rate": 0.001, "loss": 1.9494, "step": 275100 }, { "epoch": 16.096391179739136, "grad_norm": 0.2410341203212738, "learning_rate": 0.001, "loss": 1.946, "step": 275200 }, { "epoch": 16.10224015909224, "grad_norm": 0.22087277472019196, "learning_rate": 0.001, "loss": 1.9469, "step": 275300 }, { "epoch": 16.10808913844534, "grad_norm": 0.2710227370262146, "learning_rate": 0.001, "loss": 1.9454, "step": 275400 }, { "epoch": 16.113938117798444, "grad_norm": 0.3188380300998688, "learning_rate": 0.001, "loss": 1.9476, "step": 275500 }, { "epoch": 16.119787097151548, "grad_norm": 0.36990055441856384, "learning_rate": 0.001, "loss": 1.9517, "step": 275600 }, { "epoch": 16.12563607650465, "grad_norm": 0.2898365557193756, "learning_rate": 0.001, "loss": 1.9468, "step": 275700 }, { "epoch": 16.131485055857752, "grad_norm": 0.232712984085083, "learning_rate": 0.001, "loss": 1.9471, "step": 275800 }, { "epoch": 16.137334035210856, "grad_norm": 0.2456212043762207, "learning_rate": 0.001, "loss": 1.9422, "step": 275900 }, { "epoch": 16.14318301456396, "grad_norm": 0.21017874777317047, "learning_rate": 0.001, "loss": 1.9435, "step": 276000 }, { "epoch": 16.149031993917063, "grad_norm": 0.2609722316265106, "learning_rate": 0.001, "loss": 1.9405, "step": 276100 }, { "epoch": 16.154880973270163, "grad_norm": 0.18899324536323547, "learning_rate": 0.001, "loss": 1.9425, "step": 276200 }, { "epoch": 16.160729952623267, "grad_norm": 0.22761715948581696, "learning_rate": 0.001, "loss": 1.9448, "step": 276300 }, { "epoch": 16.16657893197637, "grad_norm": 0.3159920275211334, "learning_rate": 0.001, "loss": 1.9501, "step": 276400 }, { "epoch": 16.17242791132947, "grad_norm": 0.19001489877700806, "learning_rate": 0.001, "loss": 1.9489, "step": 276500 }, { "epoch": 16.178276890682575, "grad_norm": 0.2490403652191162, "learning_rate": 0.001, "loss": 1.9424, "step": 276600 }, { "epoch": 16.18412587003568, "grad_norm": 0.22748030722141266, "learning_rate": 0.001, "loss": 1.9484, "step": 276700 }, { "epoch": 16.189974849388783, "grad_norm": 0.22319462895393372, "learning_rate": 0.001, "loss": 1.945, "step": 276800 }, { "epoch": 16.195823828741883, "grad_norm": 0.2775876820087433, "learning_rate": 0.001, "loss": 1.9458, "step": 276900 }, { "epoch": 16.201672808094987, "grad_norm": 0.306112140417099, "learning_rate": 0.001, "loss": 1.953, "step": 277000 }, { "epoch": 16.20752178744809, "grad_norm": 0.3116012513637543, "learning_rate": 0.001, "loss": 1.9511, "step": 277100 }, { "epoch": 16.213370766801194, "grad_norm": 0.2392006814479828, "learning_rate": 0.001, "loss": 1.947, "step": 277200 }, { "epoch": 16.219219746154295, "grad_norm": 0.2620299458503723, "learning_rate": 0.001, "loss": 1.9511, "step": 277300 }, { "epoch": 16.2250687255074, "grad_norm": 0.2775952219963074, "learning_rate": 0.001, "loss": 1.9464, "step": 277400 }, { "epoch": 16.230917704860502, "grad_norm": 0.1833045929670334, "learning_rate": 0.001, "loss": 1.9464, "step": 277500 }, { "epoch": 16.236766684213606, "grad_norm": 0.24070030450820923, "learning_rate": 0.001, "loss": 1.9461, "step": 277600 }, { "epoch": 16.242615663566706, "grad_norm": 0.23348446190357208, "learning_rate": 0.001, "loss": 1.9434, "step": 277700 }, { "epoch": 16.24846464291981, "grad_norm": 0.22312594950199127, "learning_rate": 0.001, "loss": 1.9474, "step": 277800 }, { "epoch": 16.254313622272914, "grad_norm": 0.22795718908309937, "learning_rate": 0.001, "loss": 1.9521, "step": 277900 }, { "epoch": 16.260162601626018, "grad_norm": 0.275328129529953, "learning_rate": 0.001, "loss": 1.9502, "step": 278000 }, { "epoch": 16.266011580979118, "grad_norm": 0.2539157569408417, "learning_rate": 0.001, "loss": 1.9487, "step": 278100 }, { "epoch": 16.27186056033222, "grad_norm": 0.2602057456970215, "learning_rate": 0.001, "loss": 1.9508, "step": 278200 }, { "epoch": 16.277709539685326, "grad_norm": 0.24653732776641846, "learning_rate": 0.001, "loss": 1.9474, "step": 278300 }, { "epoch": 16.28355851903843, "grad_norm": 0.27041059732437134, "learning_rate": 0.001, "loss": 1.946, "step": 278400 }, { "epoch": 16.28940749839153, "grad_norm": 0.21657022833824158, "learning_rate": 0.001, "loss": 1.9453, "step": 278500 }, { "epoch": 16.295256477744633, "grad_norm": 0.26565802097320557, "learning_rate": 0.001, "loss": 1.9454, "step": 278600 }, { "epoch": 16.301105457097737, "grad_norm": 0.17795993387699127, "learning_rate": 0.001, "loss": 1.9492, "step": 278700 }, { "epoch": 16.30695443645084, "grad_norm": 0.20061194896697998, "learning_rate": 0.001, "loss": 1.9438, "step": 278800 }, { "epoch": 16.31280341580394, "grad_norm": 0.26714572310447693, "learning_rate": 0.001, "loss": 1.9505, "step": 278900 }, { "epoch": 16.318652395157045, "grad_norm": 0.26640596985816956, "learning_rate": 0.001, "loss": 1.9578, "step": 279000 }, { "epoch": 16.32450137451015, "grad_norm": 0.28410792350769043, "learning_rate": 0.001, "loss": 1.9505, "step": 279100 }, { "epoch": 16.330350353863253, "grad_norm": 0.24325384199619293, "learning_rate": 0.001, "loss": 1.9497, "step": 279200 }, { "epoch": 16.336199333216353, "grad_norm": 0.23128534853458405, "learning_rate": 0.001, "loss": 1.9524, "step": 279300 }, { "epoch": 16.342048312569457, "grad_norm": 0.20845550298690796, "learning_rate": 0.001, "loss": 1.9503, "step": 279400 }, { "epoch": 16.34789729192256, "grad_norm": 0.2571016848087311, "learning_rate": 0.001, "loss": 1.9448, "step": 279500 }, { "epoch": 16.35374627127566, "grad_norm": 0.3604850769042969, "learning_rate": 0.001, "loss": 1.9439, "step": 279600 }, { "epoch": 16.359595250628765, "grad_norm": 0.30426207184791565, "learning_rate": 0.001, "loss": 1.951, "step": 279700 }, { "epoch": 16.36544422998187, "grad_norm": 0.24293969571590424, "learning_rate": 0.001, "loss": 1.9478, "step": 279800 }, { "epoch": 16.371293209334972, "grad_norm": 0.24925394356250763, "learning_rate": 0.001, "loss": 1.9425, "step": 279900 }, { "epoch": 16.377142188688072, "grad_norm": 0.28728175163269043, "learning_rate": 0.001, "loss": 1.9554, "step": 280000 }, { "epoch": 16.382991168041176, "grad_norm": 0.28660035133361816, "learning_rate": 0.001, "loss": 1.9471, "step": 280100 }, { "epoch": 16.38884014739428, "grad_norm": 0.2697922885417938, "learning_rate": 0.001, "loss": 1.9508, "step": 280200 }, { "epoch": 16.394689126747384, "grad_norm": 0.23605409264564514, "learning_rate": 0.001, "loss": 1.9547, "step": 280300 }, { "epoch": 16.400538106100484, "grad_norm": 0.3188628554344177, "learning_rate": 0.001, "loss": 1.9483, "step": 280400 }, { "epoch": 16.406387085453588, "grad_norm": 0.2282750904560089, "learning_rate": 0.001, "loss": 1.9459, "step": 280500 }, { "epoch": 16.41223606480669, "grad_norm": 0.21471980214118958, "learning_rate": 0.001, "loss": 1.9498, "step": 280600 }, { "epoch": 16.418085044159795, "grad_norm": 0.20006081461906433, "learning_rate": 0.001, "loss": 1.9515, "step": 280700 }, { "epoch": 16.423934023512896, "grad_norm": 0.3106052577495575, "learning_rate": 0.001, "loss": 1.9475, "step": 280800 }, { "epoch": 16.429783002866, "grad_norm": 0.3145497441291809, "learning_rate": 0.001, "loss": 1.9541, "step": 280900 }, { "epoch": 16.435631982219103, "grad_norm": 0.2677449584007263, "learning_rate": 0.001, "loss": 1.9546, "step": 281000 }, { "epoch": 16.441480961572207, "grad_norm": 0.19074220955371857, "learning_rate": 0.001, "loss": 1.9449, "step": 281100 }, { "epoch": 16.447329940925307, "grad_norm": 0.17883767187595367, "learning_rate": 0.001, "loss": 1.9567, "step": 281200 }, { "epoch": 16.45317892027841, "grad_norm": 0.23145119845867157, "learning_rate": 0.001, "loss": 1.9521, "step": 281300 }, { "epoch": 16.459027899631515, "grad_norm": 0.30106353759765625, "learning_rate": 0.001, "loss": 1.9541, "step": 281400 }, { "epoch": 16.46487687898462, "grad_norm": 0.3922279179096222, "learning_rate": 0.001, "loss": 1.9446, "step": 281500 }, { "epoch": 16.47072585833772, "grad_norm": 0.23787298798561096, "learning_rate": 0.001, "loss": 1.9455, "step": 281600 }, { "epoch": 16.476574837690823, "grad_norm": 0.2873122990131378, "learning_rate": 0.001, "loss": 1.952, "step": 281700 }, { "epoch": 16.482423817043927, "grad_norm": 0.2764539420604706, "learning_rate": 0.001, "loss": 1.9501, "step": 281800 }, { "epoch": 16.48827279639703, "grad_norm": 0.23627305030822754, "learning_rate": 0.001, "loss": 1.9473, "step": 281900 }, { "epoch": 16.49412177575013, "grad_norm": 0.3049956262111664, "learning_rate": 0.001, "loss": 1.9559, "step": 282000 }, { "epoch": 16.499970755103234, "grad_norm": 0.2460823953151703, "learning_rate": 0.001, "loss": 1.9482, "step": 282100 }, { "epoch": 16.505819734456338, "grad_norm": 0.1842980533838272, "learning_rate": 0.001, "loss": 1.9461, "step": 282200 }, { "epoch": 16.511668713809442, "grad_norm": 0.23649190366268158, "learning_rate": 0.001, "loss": 1.9445, "step": 282300 }, { "epoch": 16.517517693162542, "grad_norm": 0.22140814363956451, "learning_rate": 0.001, "loss": 1.9527, "step": 282400 }, { "epoch": 16.523366672515646, "grad_norm": 0.2171112447977066, "learning_rate": 0.001, "loss": 1.9504, "step": 282500 }, { "epoch": 16.52921565186875, "grad_norm": 0.24690265953540802, "learning_rate": 0.001, "loss": 1.9533, "step": 282600 }, { "epoch": 16.53506463122185, "grad_norm": 0.20372754335403442, "learning_rate": 0.001, "loss": 1.9513, "step": 282700 }, { "epoch": 16.540913610574954, "grad_norm": 0.25624388456344604, "learning_rate": 0.001, "loss": 1.9447, "step": 282800 }, { "epoch": 16.546762589928058, "grad_norm": 0.25899508595466614, "learning_rate": 0.001, "loss": 1.9525, "step": 282900 }, { "epoch": 16.55261156928116, "grad_norm": 0.21050631999969482, "learning_rate": 0.001, "loss": 1.9504, "step": 283000 }, { "epoch": 16.55846054863426, "grad_norm": 0.25759705901145935, "learning_rate": 0.001, "loss": 1.9513, "step": 283100 }, { "epoch": 16.564309527987366, "grad_norm": 0.2308056652545929, "learning_rate": 0.001, "loss": 1.9524, "step": 283200 }, { "epoch": 16.57015850734047, "grad_norm": 0.2315148264169693, "learning_rate": 0.001, "loss": 1.9473, "step": 283300 }, { "epoch": 16.576007486693573, "grad_norm": 0.20762082934379578, "learning_rate": 0.001, "loss": 1.9534, "step": 283400 }, { "epoch": 16.581856466046673, "grad_norm": 0.32185229659080505, "learning_rate": 0.001, "loss": 1.953, "step": 283500 }, { "epoch": 16.587705445399777, "grad_norm": 0.22762511670589447, "learning_rate": 0.001, "loss": 1.941, "step": 283600 }, { "epoch": 16.59355442475288, "grad_norm": 0.28306451439857483, "learning_rate": 0.001, "loss": 1.9499, "step": 283700 }, { "epoch": 16.599403404105985, "grad_norm": 0.2932678759098053, "learning_rate": 0.001, "loss": 1.9529, "step": 283800 }, { "epoch": 16.605252383459085, "grad_norm": 0.4094662666320801, "learning_rate": 0.001, "loss": 1.9542, "step": 283900 }, { "epoch": 16.61110136281219, "grad_norm": 0.19756953418254852, "learning_rate": 0.001, "loss": 1.948, "step": 284000 }, { "epoch": 16.616950342165293, "grad_norm": 0.2376871109008789, "learning_rate": 0.001, "loss": 1.9513, "step": 284100 }, { "epoch": 16.622799321518396, "grad_norm": 0.25771746039390564, "learning_rate": 0.001, "loss": 1.9457, "step": 284200 }, { "epoch": 16.628648300871497, "grad_norm": 0.32781997323036194, "learning_rate": 0.001, "loss": 1.9462, "step": 284300 }, { "epoch": 16.6344972802246, "grad_norm": 0.2561827600002289, "learning_rate": 0.001, "loss": 1.9486, "step": 284400 }, { "epoch": 16.640346259577704, "grad_norm": 0.28394827246665955, "learning_rate": 0.001, "loss": 1.9511, "step": 284500 }, { "epoch": 16.646195238930808, "grad_norm": 0.2535052001476288, "learning_rate": 0.001, "loss": 1.9516, "step": 284600 }, { "epoch": 16.65204421828391, "grad_norm": 0.2388356477022171, "learning_rate": 0.001, "loss": 1.9551, "step": 284700 }, { "epoch": 16.657893197637012, "grad_norm": 0.2715851962566376, "learning_rate": 0.001, "loss": 1.9485, "step": 284800 }, { "epoch": 16.663742176990116, "grad_norm": 0.23583641648292542, "learning_rate": 0.001, "loss": 1.9525, "step": 284900 }, { "epoch": 16.66959115634322, "grad_norm": 0.2448621392250061, "learning_rate": 0.001, "loss": 1.9531, "step": 285000 }, { "epoch": 16.67544013569632, "grad_norm": 0.22949515283107758, "learning_rate": 0.001, "loss": 1.9505, "step": 285100 }, { "epoch": 16.681289115049424, "grad_norm": 0.23477528989315033, "learning_rate": 0.001, "loss": 1.9509, "step": 285200 }, { "epoch": 16.687138094402528, "grad_norm": 0.2479957938194275, "learning_rate": 0.001, "loss": 1.9559, "step": 285300 }, { "epoch": 16.69298707375563, "grad_norm": 0.23732322454452515, "learning_rate": 0.001, "loss": 1.9478, "step": 285400 }, { "epoch": 16.69883605310873, "grad_norm": 0.18850401043891907, "learning_rate": 0.001, "loss": 1.9481, "step": 285500 }, { "epoch": 16.704685032461835, "grad_norm": 0.21397563815116882, "learning_rate": 0.001, "loss": 1.9494, "step": 285600 }, { "epoch": 16.71053401181494, "grad_norm": 0.1887630671262741, "learning_rate": 0.001, "loss": 1.953, "step": 285700 }, { "epoch": 16.71638299116804, "grad_norm": 0.2260453701019287, "learning_rate": 0.001, "loss": 1.9559, "step": 285800 }, { "epoch": 16.722231970521143, "grad_norm": 0.3030242621898651, "learning_rate": 0.001, "loss": 1.9516, "step": 285900 }, { "epoch": 16.728080949874247, "grad_norm": 0.21557562053203583, "learning_rate": 0.001, "loss": 1.9477, "step": 286000 }, { "epoch": 16.73392992922735, "grad_norm": 0.23547595739364624, "learning_rate": 0.001, "loss": 1.9455, "step": 286100 }, { "epoch": 16.73977890858045, "grad_norm": 0.2047523856163025, "learning_rate": 0.001, "loss": 1.9423, "step": 286200 }, { "epoch": 16.745627887933555, "grad_norm": 0.2624250650405884, "learning_rate": 0.001, "loss": 1.948, "step": 286300 }, { "epoch": 16.75147686728666, "grad_norm": 0.21646060049533844, "learning_rate": 0.001, "loss": 1.949, "step": 286400 }, { "epoch": 16.757325846639763, "grad_norm": 0.2366533726453781, "learning_rate": 0.001, "loss": 1.9487, "step": 286500 }, { "epoch": 16.763174825992863, "grad_norm": 0.1771666556596756, "learning_rate": 0.001, "loss": 1.9443, "step": 286600 }, { "epoch": 16.769023805345967, "grad_norm": 0.25607040524482727, "learning_rate": 0.001, "loss": 1.9552, "step": 286700 }, { "epoch": 16.77487278469907, "grad_norm": 0.2601061165332794, "learning_rate": 0.001, "loss": 1.9429, "step": 286800 }, { "epoch": 16.780721764052174, "grad_norm": 0.22040899097919464, "learning_rate": 0.001, "loss": 1.9491, "step": 286900 }, { "epoch": 16.786570743405274, "grad_norm": 0.20863832533359528, "learning_rate": 0.001, "loss": 1.953, "step": 287000 }, { "epoch": 16.79241972275838, "grad_norm": 0.15585534274578094, "learning_rate": 0.001, "loss": 1.9516, "step": 287100 }, { "epoch": 16.798268702111482, "grad_norm": 0.37037304043769836, "learning_rate": 0.001, "loss": 1.9562, "step": 287200 }, { "epoch": 16.804117681464586, "grad_norm": 0.31126388907432556, "learning_rate": 0.001, "loss": 1.9568, "step": 287300 }, { "epoch": 16.809966660817686, "grad_norm": 0.29085686802864075, "learning_rate": 0.001, "loss": 1.95, "step": 287400 }, { "epoch": 16.81581564017079, "grad_norm": 0.2332916408777237, "learning_rate": 0.001, "loss": 1.9472, "step": 287500 }, { "epoch": 16.821664619523894, "grad_norm": 0.26883599162101746, "learning_rate": 0.001, "loss": 1.946, "step": 287600 }, { "epoch": 16.827513598876997, "grad_norm": 0.243086576461792, "learning_rate": 0.001, "loss": 1.9477, "step": 287700 }, { "epoch": 16.833362578230098, "grad_norm": 0.26908040046691895, "learning_rate": 0.001, "loss": 1.9479, "step": 287800 }, { "epoch": 16.8392115575832, "grad_norm": 0.22997482120990753, "learning_rate": 0.001, "loss": 1.952, "step": 287900 }, { "epoch": 16.845060536936305, "grad_norm": 0.2567038834095001, "learning_rate": 0.001, "loss": 1.946, "step": 288000 }, { "epoch": 16.85090951628941, "grad_norm": 0.24211068451404572, "learning_rate": 0.001, "loss": 1.9593, "step": 288100 }, { "epoch": 16.85675849564251, "grad_norm": 0.34210240840911865, "learning_rate": 0.001, "loss": 1.951, "step": 288200 }, { "epoch": 16.862607474995613, "grad_norm": 0.242950439453125, "learning_rate": 0.001, "loss": 1.9529, "step": 288300 }, { "epoch": 16.868456454348717, "grad_norm": 0.16129636764526367, "learning_rate": 0.001, "loss": 1.9551, "step": 288400 }, { "epoch": 16.87430543370182, "grad_norm": 0.2812546193599701, "learning_rate": 0.001, "loss": 1.9476, "step": 288500 }, { "epoch": 16.88015441305492, "grad_norm": 0.37193763256073, "learning_rate": 0.001, "loss": 1.9595, "step": 288600 }, { "epoch": 16.886003392408025, "grad_norm": 0.25899145007133484, "learning_rate": 0.001, "loss": 1.9486, "step": 288700 }, { "epoch": 16.89185237176113, "grad_norm": 0.30308860540390015, "learning_rate": 0.001, "loss": 1.9507, "step": 288800 }, { "epoch": 16.89770135111423, "grad_norm": 0.29305440187454224, "learning_rate": 0.001, "loss": 1.9485, "step": 288900 }, { "epoch": 16.903550330467333, "grad_norm": 0.21654193103313446, "learning_rate": 0.001, "loss": 1.953, "step": 289000 }, { "epoch": 16.909399309820436, "grad_norm": 0.23880666494369507, "learning_rate": 0.001, "loss": 1.9459, "step": 289100 }, { "epoch": 16.91524828917354, "grad_norm": 0.18415948748588562, "learning_rate": 0.001, "loss": 1.9541, "step": 289200 }, { "epoch": 16.92109726852664, "grad_norm": 0.19845519959926605, "learning_rate": 0.001, "loss": 1.9596, "step": 289300 }, { "epoch": 16.926946247879744, "grad_norm": 0.2431439757347107, "learning_rate": 0.001, "loss": 1.9541, "step": 289400 }, { "epoch": 16.932795227232848, "grad_norm": 0.2635173499584198, "learning_rate": 0.001, "loss": 1.9542, "step": 289500 }, { "epoch": 16.938644206585952, "grad_norm": 0.25687113404273987, "learning_rate": 0.001, "loss": 1.9545, "step": 289600 }, { "epoch": 16.944493185939052, "grad_norm": 0.2091563642024994, "learning_rate": 0.001, "loss": 1.9587, "step": 289700 }, { "epoch": 16.950342165292156, "grad_norm": 0.23007017374038696, "learning_rate": 0.001, "loss": 1.9482, "step": 289800 }, { "epoch": 16.95619114464526, "grad_norm": 0.24348552525043488, "learning_rate": 0.001, "loss": 1.9475, "step": 289900 }, { "epoch": 16.962040123998364, "grad_norm": 0.1967957466840744, "learning_rate": 0.001, "loss": 1.9517, "step": 290000 }, { "epoch": 16.967889103351464, "grad_norm": 0.20912472903728485, "learning_rate": 0.001, "loss": 1.9593, "step": 290100 }, { "epoch": 16.973738082704568, "grad_norm": 0.25374117493629456, "learning_rate": 0.001, "loss": 1.9501, "step": 290200 }, { "epoch": 16.97958706205767, "grad_norm": 0.2960880398750305, "learning_rate": 0.001, "loss": 1.9536, "step": 290300 }, { "epoch": 16.985436041410775, "grad_norm": 0.21243582665920258, "learning_rate": 0.001, "loss": 1.9504, "step": 290400 }, { "epoch": 16.991285020763875, "grad_norm": 0.222451314330101, "learning_rate": 0.001, "loss": 1.95, "step": 290500 }, { "epoch": 16.99713400011698, "grad_norm": 0.22519798576831818, "learning_rate": 0.001, "loss": 1.9454, "step": 290600 }, { "epoch": 17.002982979470083, "grad_norm": 0.2070201188325882, "learning_rate": 0.001, "loss": 1.9412, "step": 290700 }, { "epoch": 17.008831958823187, "grad_norm": 0.31184253096580505, "learning_rate": 0.001, "loss": 1.928, "step": 290800 }, { "epoch": 17.014680938176287, "grad_norm": 0.21995311975479126, "learning_rate": 0.001, "loss": 1.942, "step": 290900 }, { "epoch": 17.02052991752939, "grad_norm": 0.20177200436592102, "learning_rate": 0.001, "loss": 1.9322, "step": 291000 }, { "epoch": 17.026378896882495, "grad_norm": 0.2609807550907135, "learning_rate": 0.001, "loss": 1.9377, "step": 291100 }, { "epoch": 17.0322278762356, "grad_norm": 0.25263717770576477, "learning_rate": 0.001, "loss": 1.9405, "step": 291200 }, { "epoch": 17.0380768555887, "grad_norm": 0.23418225347995758, "learning_rate": 0.001, "loss": 1.939, "step": 291300 }, { "epoch": 17.043925834941803, "grad_norm": 0.21050295233726501, "learning_rate": 0.001, "loss": 1.935, "step": 291400 }, { "epoch": 17.049774814294906, "grad_norm": 0.29010823369026184, "learning_rate": 0.001, "loss": 1.9344, "step": 291500 }, { "epoch": 17.05562379364801, "grad_norm": 0.1937337964773178, "learning_rate": 0.001, "loss": 1.9348, "step": 291600 }, { "epoch": 17.06147277300111, "grad_norm": 0.1611846387386322, "learning_rate": 0.001, "loss": 1.9297, "step": 291700 }, { "epoch": 17.067321752354214, "grad_norm": 0.162088081240654, "learning_rate": 0.001, "loss": 1.9356, "step": 291800 }, { "epoch": 17.073170731707318, "grad_norm": 0.32226958870887756, "learning_rate": 0.001, "loss": 1.9336, "step": 291900 }, { "epoch": 17.07901971106042, "grad_norm": 0.22583086788654327, "learning_rate": 0.001, "loss": 1.9361, "step": 292000 }, { "epoch": 17.084868690413522, "grad_norm": 0.2400020956993103, "learning_rate": 0.001, "loss": 1.9327, "step": 292100 }, { "epoch": 17.090717669766626, "grad_norm": 0.3220212757587433, "learning_rate": 0.001, "loss": 1.9427, "step": 292200 }, { "epoch": 17.09656664911973, "grad_norm": 0.23143669962882996, "learning_rate": 0.001, "loss": 1.9351, "step": 292300 }, { "epoch": 17.10241562847283, "grad_norm": 0.19350272417068481, "learning_rate": 0.001, "loss": 1.9382, "step": 292400 }, { "epoch": 17.108264607825934, "grad_norm": 0.20278696715831757, "learning_rate": 0.001, "loss": 1.9385, "step": 292500 }, { "epoch": 17.114113587179038, "grad_norm": 0.300204336643219, "learning_rate": 0.001, "loss": 1.9391, "step": 292600 }, { "epoch": 17.11996256653214, "grad_norm": 0.2655709683895111, "learning_rate": 0.001, "loss": 1.9406, "step": 292700 }, { "epoch": 17.12581154588524, "grad_norm": 0.39564424753189087, "learning_rate": 0.001, "loss": 1.9413, "step": 292800 }, { "epoch": 17.131660525238345, "grad_norm": 0.24906161427497864, "learning_rate": 0.001, "loss": 1.9347, "step": 292900 }, { "epoch": 17.13750950459145, "grad_norm": 0.19004255533218384, "learning_rate": 0.001, "loss": 1.9315, "step": 293000 }, { "epoch": 17.143358483944553, "grad_norm": 0.1908150017261505, "learning_rate": 0.001, "loss": 1.9333, "step": 293100 }, { "epoch": 17.149207463297653, "grad_norm": 0.19346043467521667, "learning_rate": 0.001, "loss": 1.9371, "step": 293200 }, { "epoch": 17.155056442650757, "grad_norm": 0.17890717089176178, "learning_rate": 0.001, "loss": 1.9326, "step": 293300 }, { "epoch": 17.16090542200386, "grad_norm": 0.21539254486560822, "learning_rate": 0.001, "loss": 1.9338, "step": 293400 }, { "epoch": 17.166754401356965, "grad_norm": 0.22389566898345947, "learning_rate": 0.001, "loss": 1.9361, "step": 293500 }, { "epoch": 17.172603380710065, "grad_norm": 0.2578187882900238, "learning_rate": 0.001, "loss": 1.9321, "step": 293600 }, { "epoch": 17.17845236006317, "grad_norm": 0.2133169025182724, "learning_rate": 0.001, "loss": 1.9302, "step": 293700 }, { "epoch": 17.184301339416272, "grad_norm": 0.2573752999305725, "learning_rate": 0.001, "loss": 1.9416, "step": 293800 }, { "epoch": 17.190150318769376, "grad_norm": 0.1910058856010437, "learning_rate": 0.001, "loss": 1.9389, "step": 293900 }, { "epoch": 17.195999298122477, "grad_norm": 0.19667769968509674, "learning_rate": 0.001, "loss": 1.9368, "step": 294000 }, { "epoch": 17.20184827747558, "grad_norm": 0.25203651189804077, "learning_rate": 0.001, "loss": 1.9346, "step": 294100 }, { "epoch": 17.207697256828684, "grad_norm": 0.1878858357667923, "learning_rate": 0.001, "loss": 1.9399, "step": 294200 }, { "epoch": 17.213546236181788, "grad_norm": 0.24638308584690094, "learning_rate": 0.001, "loss": 1.945, "step": 294300 }, { "epoch": 17.219395215534888, "grad_norm": 0.2131374180316925, "learning_rate": 0.001, "loss": 1.9382, "step": 294400 }, { "epoch": 17.225244194887992, "grad_norm": 0.23746992647647858, "learning_rate": 0.001, "loss": 1.9323, "step": 294500 }, { "epoch": 17.231093174241096, "grad_norm": 0.16879801452159882, "learning_rate": 0.001, "loss": 1.9402, "step": 294600 }, { "epoch": 17.2369421535942, "grad_norm": 0.22942675650119781, "learning_rate": 0.001, "loss": 1.933, "step": 294700 }, { "epoch": 17.2427911329473, "grad_norm": 0.2467707097530365, "learning_rate": 0.001, "loss": 1.9445, "step": 294800 }, { "epoch": 17.248640112300404, "grad_norm": 0.18033242225646973, "learning_rate": 0.001, "loss": 1.9434, "step": 294900 }, { "epoch": 17.254489091653507, "grad_norm": 0.35132360458374023, "learning_rate": 0.001, "loss": 1.9439, "step": 295000 }, { "epoch": 17.260338071006608, "grad_norm": 0.18494510650634766, "learning_rate": 0.001, "loss": 1.9369, "step": 295100 }, { "epoch": 17.26618705035971, "grad_norm": 0.27197176218032837, "learning_rate": 0.001, "loss": 1.941, "step": 295200 }, { "epoch": 17.272036029712815, "grad_norm": 0.3333626389503479, "learning_rate": 0.001, "loss": 1.9411, "step": 295300 }, { "epoch": 17.27788500906592, "grad_norm": 0.34750238060951233, "learning_rate": 0.001, "loss": 1.9351, "step": 295400 }, { "epoch": 17.28373398841902, "grad_norm": 0.4507924020290375, "learning_rate": 0.001, "loss": 1.9457, "step": 295500 }, { "epoch": 17.289582967772123, "grad_norm": 0.18575666844844818, "learning_rate": 0.001, "loss": 1.9474, "step": 295600 }, { "epoch": 17.295431947125227, "grad_norm": 0.20432321727275848, "learning_rate": 0.001, "loss": 1.9328, "step": 295700 }, { "epoch": 17.30128092647833, "grad_norm": 0.1730693131685257, "learning_rate": 0.001, "loss": 1.9426, "step": 295800 }, { "epoch": 17.30712990583143, "grad_norm": 0.18717139959335327, "learning_rate": 0.001, "loss": 1.9399, "step": 295900 }, { "epoch": 17.312978885184535, "grad_norm": 0.19075246155261993, "learning_rate": 0.001, "loss": 1.9385, "step": 296000 }, { "epoch": 17.31882786453764, "grad_norm": 0.20600725710391998, "learning_rate": 0.001, "loss": 1.9421, "step": 296100 }, { "epoch": 17.324676843890742, "grad_norm": 0.23719404637813568, "learning_rate": 0.001, "loss": 1.943, "step": 296200 }, { "epoch": 17.330525823243843, "grad_norm": 0.18517239391803741, "learning_rate": 0.001, "loss": 1.9449, "step": 296300 }, { "epoch": 17.336374802596946, "grad_norm": 0.1455155611038208, "learning_rate": 0.001, "loss": 1.9353, "step": 296400 }, { "epoch": 17.34222378195005, "grad_norm": 0.18741832673549652, "learning_rate": 0.001, "loss": 1.9383, "step": 296500 }, { "epoch": 17.348072761303154, "grad_norm": 0.17004072666168213, "learning_rate": 0.001, "loss": 1.9412, "step": 296600 }, { "epoch": 17.353921740656254, "grad_norm": 0.2767903208732605, "learning_rate": 0.001, "loss": 1.9437, "step": 296700 }, { "epoch": 17.359770720009358, "grad_norm": 0.22487911581993103, "learning_rate": 0.001, "loss": 1.9385, "step": 296800 }, { "epoch": 17.365619699362462, "grad_norm": 0.25966259837150574, "learning_rate": 0.001, "loss": 1.9406, "step": 296900 }, { "epoch": 17.371468678715566, "grad_norm": 0.23301354050636292, "learning_rate": 0.001, "loss": 1.9458, "step": 297000 }, { "epoch": 17.377317658068666, "grad_norm": 0.208836629986763, "learning_rate": 0.001, "loss": 1.9411, "step": 297100 }, { "epoch": 17.38316663742177, "grad_norm": 0.20812518894672394, "learning_rate": 0.001, "loss": 1.937, "step": 297200 }, { "epoch": 17.389015616774874, "grad_norm": 0.21840788424015045, "learning_rate": 0.001, "loss": 1.941, "step": 297300 }, { "epoch": 17.394864596127977, "grad_norm": 0.17017662525177002, "learning_rate": 0.001, "loss": 1.9448, "step": 297400 }, { "epoch": 17.400713575481078, "grad_norm": 0.19330397248268127, "learning_rate": 0.001, "loss": 1.9419, "step": 297500 }, { "epoch": 17.40656255483418, "grad_norm": 0.23898272216320038, "learning_rate": 0.001, "loss": 1.9393, "step": 297600 }, { "epoch": 17.412411534187285, "grad_norm": 0.2411794811487198, "learning_rate": 0.001, "loss": 1.9429, "step": 297700 }, { "epoch": 17.41826051354039, "grad_norm": 0.1942555010318756, "learning_rate": 0.001, "loss": 1.9431, "step": 297800 }, { "epoch": 17.42410949289349, "grad_norm": 0.26566413044929504, "learning_rate": 0.001, "loss": 1.9428, "step": 297900 }, { "epoch": 17.429958472246593, "grad_norm": 0.16736818850040436, "learning_rate": 0.001, "loss": 1.9395, "step": 298000 }, { "epoch": 17.435807451599697, "grad_norm": 0.45521822571754456, "learning_rate": 0.001, "loss": 1.9502, "step": 298100 }, { "epoch": 17.441656430952797, "grad_norm": 0.32742980122566223, "learning_rate": 0.001, "loss": 1.9478, "step": 298200 }, { "epoch": 17.4475054103059, "grad_norm": 0.20074817538261414, "learning_rate": 0.001, "loss": 1.943, "step": 298300 }, { "epoch": 17.453354389659005, "grad_norm": 0.2333943247795105, "learning_rate": 0.001, "loss": 1.9366, "step": 298400 }, { "epoch": 17.45920336901211, "grad_norm": 0.2724263668060303, "learning_rate": 0.001, "loss": 1.9411, "step": 298500 }, { "epoch": 17.46505234836521, "grad_norm": 0.2173369973897934, "learning_rate": 0.001, "loss": 1.9468, "step": 298600 }, { "epoch": 17.470901327718313, "grad_norm": 0.17673556506633759, "learning_rate": 0.001, "loss": 1.9465, "step": 298700 }, { "epoch": 17.476750307071416, "grad_norm": 0.2533077001571655, "learning_rate": 0.001, "loss": 1.9433, "step": 298800 }, { "epoch": 17.48259928642452, "grad_norm": 0.23407499492168427, "learning_rate": 0.001, "loss": 1.9409, "step": 298900 }, { "epoch": 17.48844826577762, "grad_norm": 0.24491439759731293, "learning_rate": 0.001, "loss": 1.9405, "step": 299000 }, { "epoch": 17.494297245130724, "grad_norm": 0.2992870807647705, "learning_rate": 0.001, "loss": 1.9409, "step": 299100 }, { "epoch": 17.500146224483828, "grad_norm": 0.22648338973522186, "learning_rate": 0.001, "loss": 1.9476, "step": 299200 }, { "epoch": 17.50599520383693, "grad_norm": 0.22134284675121307, "learning_rate": 0.001, "loss": 1.9389, "step": 299300 }, { "epoch": 17.511844183190032, "grad_norm": 0.23938876390457153, "learning_rate": 0.001, "loss": 1.9469, "step": 299400 }, { "epoch": 17.517693162543136, "grad_norm": 0.26311561465263367, "learning_rate": 0.001, "loss": 1.9427, "step": 299500 }, { "epoch": 17.52354214189624, "grad_norm": 0.2503843307495117, "learning_rate": 0.001, "loss": 1.9459, "step": 299600 }, { "epoch": 17.529391121249343, "grad_norm": 0.22658583521842957, "learning_rate": 0.001, "loss": 1.9433, "step": 299700 }, { "epoch": 17.535240100602444, "grad_norm": 0.36413997411727905, "learning_rate": 0.001, "loss": 1.9418, "step": 299800 }, { "epoch": 17.541089079955547, "grad_norm": 0.14254996180534363, "learning_rate": 0.001, "loss": 1.9287, "step": 299900 }, { "epoch": 17.54693805930865, "grad_norm": 0.24154981970787048, "learning_rate": 0.001, "loss": 1.9436, "step": 300000 }, { "epoch": 17.54693805930865, "eval_ag_news_accuracy": 0.236609375, "eval_ag_news_bleu_score": 6.600678215422683, "eval_ag_news_bleu_score_sem": 0.4990820265625752, "eval_ag_news_emb_cos_sim": 0.6912261843681335, "eval_ag_news_emb_cos_sim_sem": 0.014211337082087994, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7553083896636963, "eval_ag_news_n_ngrams_match_1": 13.921875, "eval_ag_news_n_ngrams_match_2": 3.671875, "eval_ag_news_n_ngrams_match_3": 1.3203125, "eval_ag_news_num_pred_words": 47.7265625, "eval_ag_news_num_true_words": 46.15625, "eval_ag_news_perplexity": 15.72588985696581, "eval_ag_news_pred_num_tokens": 72.1875, "eval_ag_news_rouge_score": 0.2832759825199893, "eval_ag_news_runtime": 38.1912, "eval_ag_news_samples_per_second": 13.092, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.32180909686376336, "eval_ag_news_token_set_f1_sem": 0.01002252492536281, "eval_ag_news_token_set_precision": 0.30022478505123357, "eval_ag_news_token_set_recall": 0.363304384857291, "eval_ag_news_true_num_tokens": 62.9921875, "step": 300000 }, { "epoch": 17.54693805930865, "eval_anthropic_toxic_prompts_accuracy": 0.101625, "eval_anthropic_toxic_prompts_bleu_score": 40.2590524156886, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.6209948413731947, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8749348521232605, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009697386994957924, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02854125206846796, "eval_anthropic_toxic_prompts_loss": 1.2693959474563599, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.0234375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.3515625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.2890625, "eval_anthropic_toxic_prompts_num_pred_words": 14.8359375, "eval_anthropic_toxic_prompts_num_true_words": 14.6640625, "eval_anthropic_toxic_prompts_perplexity": 3.558702269817924, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.03125, "eval_anthropic_toxic_prompts_rouge_score": 0.6783069946898187, "eval_anthropic_toxic_prompts_runtime": 29.7005, "eval_anthropic_toxic_prompts_samples_per_second": 16.835, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6934215763299695, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018851146341120038, "eval_anthropic_toxic_prompts_token_set_precision": 0.6938791760793159, "eval_anthropic_toxic_prompts_token_set_recall": 0.697517794122652, "eval_anthropic_toxic_prompts_true_num_tokens": 18.0390625, "step": 300000 }, { "epoch": 17.54693805930865, "eval_arxiv_accuracy": 0.370671875, "eval_arxiv_bleu_score": 1.4874769607361609, "eval_arxiv_bleu_score_sem": 0.13336593256970627, "eval_arxiv_emb_cos_sim": 0.4343335032463074, "eval_arxiv_emb_cos_sim_sem": 0.02030111476778984, "eval_arxiv_emb_top1_equal": 0.859375, "eval_arxiv_emb_top1_equal_sem": 0.03084755875170231, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.475151538848877, "eval_arxiv_n_ngrams_match_1": 11.9765625, "eval_arxiv_n_ngrams_match_2": 1.984375, "eval_arxiv_n_ngrams_match_3": 0.40625, "eval_arxiv_num_pred_words": 51.4765625, "eval_arxiv_num_true_words": 86.7734375, "eval_arxiv_perplexity": 32.30272328323111, "eval_arxiv_pred_num_tokens": 125.90625, "eval_arxiv_rouge_score": 0.15866417789615117, "eval_arxiv_runtime": 30.482, "eval_arxiv_samples_per_second": 16.403, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.15812151815216124, "eval_arxiv_token_set_f1_sem": 0.008225326701702068, "eval_arxiv_token_set_precision": 0.10538360434589043, "eval_arxiv_token_set_recall": 0.389749402380288, "eval_arxiv_true_num_tokens": 125.1953125, "step": 300000 }, { "epoch": 17.54693805930865, "eval_python_code_alpaca_accuracy": 0.1295, "eval_python_code_alpaca_bleu_score": 26.869193707834505, "eval_python_code_alpaca_bleu_score_sem": 1.7609465700978424, "eval_python_code_alpaca_emb_cos_sim": 0.8460558652877808, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011474037542939186, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.5115605592727661, "eval_python_code_alpaca_n_ngrams_match_1": 10.8671875, "eval_python_code_alpaca_n_ngrams_match_2": 5.625, "eval_python_code_alpaca_n_ngrams_match_3": 3.2265625, "eval_python_code_alpaca_num_pred_words": 19.5546875, "eval_python_code_alpaca_num_true_words": 20.859375, "eval_python_code_alpaca_perplexity": 4.533800540975216, "eval_python_code_alpaca_pred_num_tokens": 27.71875, "eval_python_code_alpaca_rouge_score": 0.5865492330653328, "eval_python_code_alpaca_runtime": 30.8256, "eval_python_code_alpaca_samples_per_second": 16.22, "eval_python_code_alpaca_steps_per_second": 0.032, "eval_python_code_alpaca_token_set_f1": 0.5975433639488742, "eval_python_code_alpaca_token_set_f1_sem": 0.014580635919972707, "eval_python_code_alpaca_token_set_precision": 0.5778129960853593, "eval_python_code_alpaca_token_set_recall": 0.6282657264086853, "eval_python_code_alpaca_true_num_tokens": 27.3046875, "step": 300000 }, { "epoch": 17.54693805930865, "eval_wikibio_accuracy": 0.36203125, "eval_wikibio_bleu_score": 6.4126204544652285, "eval_wikibio_bleu_score_sem": 0.5913023041559217, "eval_wikibio_emb_cos_sim": 0.5902009010314941, "eval_wikibio_emb_cos_sim_sem": 0.023896491155028343, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.74698543548584, "eval_wikibio_n_ngrams_match_1": 13.8359375, "eval_wikibio_n_ngrams_match_2": 4.578125, "eval_wikibio_n_ngrams_match_3": 1.765625, "eval_wikibio_num_pred_words": 51.265625, "eval_wikibio_num_true_words": 51.8046875, "eval_wikibio_perplexity": 15.595547166818347, "eval_wikibio_pred_num_tokens": 104.40625, "eval_wikibio_rouge_score": 0.273421660916535, "eval_wikibio_runtime": 30.6609, "eval_wikibio_samples_per_second": 16.307, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.3048580527637976, "eval_wikibio_token_set_f1_sem": 0.013147472635420552, "eval_wikibio_token_set_precision": 0.26892601183863274, "eval_wikibio_token_set_recall": 0.40574088623143717, "eval_wikibio_true_num_tokens": 98.4296875, "step": 300000 }, { "epoch": 17.54693805930865, "eval_msmarco_accuracy": 0.38875, "eval_msmarco_bleu_score": 17.67957390945812, "eval_msmarco_bleu_score_sem": 1.4952816410008758, "eval_msmarco_emb_cos_sim": 0.7895345091819763, "eval_msmarco_emb_cos_sim_sem": 0.015201306901872158, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8218804597854614, "eval_msmarco_n_ngrams_match_1": 28.4921875, "eval_msmarco_n_ngrams_match_2": 13.1328125, "eval_msmarco_n_ngrams_match_3": 7.4765625, "eval_msmarco_num_pred_words": 61.1796875, "eval_msmarco_num_true_words": 61.5234375, "eval_msmarco_perplexity": 6.183475300587187, "eval_msmarco_pred_num_tokens": 82.4765625, "eval_msmarco_rouge_score": 0.4492612677355678, "eval_msmarco_runtime": 26.0703, "eval_msmarco_samples_per_second": 19.179, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.4767850735314191, "eval_msmarco_token_set_f1_sem": 0.014013365941299873, "eval_msmarco_token_set_precision": 0.4377552964760337, "eval_msmarco_token_set_recall": 0.5389553372415019, "eval_msmarco_true_num_tokens": 78.9453125, "step": 300000 }, { "epoch": 17.552787038661755, "grad_norm": 0.3075239956378937, "learning_rate": 0.001, "loss": 1.9453, "step": 300100 }, { "epoch": 17.558636018014855, "grad_norm": 0.32026106119155884, "learning_rate": 0.001, "loss": 1.9451, "step": 300200 }, { "epoch": 17.56448499736796, "grad_norm": 0.2148813009262085, "learning_rate": 0.001, "loss": 1.942, "step": 300300 }, { "epoch": 17.570333976721063, "grad_norm": 0.2656821608543396, "learning_rate": 0.001, "loss": 1.942, "step": 300400 }, { "epoch": 17.576182956074167, "grad_norm": 0.21693044900894165, "learning_rate": 0.001, "loss": 1.9426, "step": 300500 }, { "epoch": 17.582031935427267, "grad_norm": 0.2204744964838028, "learning_rate": 0.001, "loss": 1.9431, "step": 300600 }, { "epoch": 17.58788091478037, "grad_norm": 0.2703433632850647, "learning_rate": 0.001, "loss": 1.9435, "step": 300700 }, { "epoch": 17.593729894133475, "grad_norm": 0.28231754899024963, "learning_rate": 0.001, "loss": 1.9399, "step": 300800 }, { "epoch": 17.59957887348658, "grad_norm": 0.2401001751422882, "learning_rate": 0.001, "loss": 1.9447, "step": 300900 }, { "epoch": 17.60542785283968, "grad_norm": 0.20414766669273376, "learning_rate": 0.001, "loss": 1.9452, "step": 301000 }, { "epoch": 17.611276832192782, "grad_norm": 0.1724698543548584, "learning_rate": 0.001, "loss": 1.9363, "step": 301100 }, { "epoch": 17.617125811545886, "grad_norm": 0.19569402933120728, "learning_rate": 0.001, "loss": 1.9393, "step": 301200 }, { "epoch": 17.622974790898986, "grad_norm": 0.20125044882297516, "learning_rate": 0.001, "loss": 1.938, "step": 301300 }, { "epoch": 17.62882377025209, "grad_norm": 0.3836452066898346, "learning_rate": 0.001, "loss": 1.9442, "step": 301400 }, { "epoch": 17.634672749605194, "grad_norm": 0.35936635732650757, "learning_rate": 0.001, "loss": 1.9532, "step": 301500 }, { "epoch": 17.640521728958298, "grad_norm": 0.2544173002243042, "learning_rate": 0.001, "loss": 1.9446, "step": 301600 }, { "epoch": 17.646370708311398, "grad_norm": 0.22944216430187225, "learning_rate": 0.001, "loss": 1.942, "step": 301700 }, { "epoch": 17.652219687664502, "grad_norm": 0.19376809895038605, "learning_rate": 0.001, "loss": 1.9408, "step": 301800 }, { "epoch": 17.658068667017606, "grad_norm": 0.2501823604106903, "learning_rate": 0.001, "loss": 1.9402, "step": 301900 }, { "epoch": 17.66391764637071, "grad_norm": 0.25526779890060425, "learning_rate": 0.001, "loss": 1.9421, "step": 302000 }, { "epoch": 17.66976662572381, "grad_norm": 0.24061349034309387, "learning_rate": 0.001, "loss": 1.9464, "step": 302100 }, { "epoch": 17.675615605076914, "grad_norm": 0.30149614810943604, "learning_rate": 0.001, "loss": 1.9412, "step": 302200 }, { "epoch": 17.681464584430017, "grad_norm": 0.1926167905330658, "learning_rate": 0.001, "loss": 1.9455, "step": 302300 }, { "epoch": 17.68731356378312, "grad_norm": 0.2951646149158478, "learning_rate": 0.001, "loss": 1.941, "step": 302400 }, { "epoch": 17.69316254313622, "grad_norm": 0.27565374970436096, "learning_rate": 0.001, "loss": 1.9402, "step": 302500 }, { "epoch": 17.699011522489325, "grad_norm": 0.22560961544513702, "learning_rate": 0.001, "loss": 1.9419, "step": 302600 }, { "epoch": 17.70486050184243, "grad_norm": 0.1737891584634781, "learning_rate": 0.001, "loss": 1.9398, "step": 302700 }, { "epoch": 17.710709481195533, "grad_norm": 0.20271846652030945, "learning_rate": 0.001, "loss": 1.9382, "step": 302800 }, { "epoch": 17.716558460548633, "grad_norm": 0.23472225666046143, "learning_rate": 0.001, "loss": 1.947, "step": 302900 }, { "epoch": 17.722407439901737, "grad_norm": 0.2308504581451416, "learning_rate": 0.001, "loss": 1.9429, "step": 303000 }, { "epoch": 17.72825641925484, "grad_norm": 0.22644926607608795, "learning_rate": 0.001, "loss": 1.9384, "step": 303100 }, { "epoch": 17.734105398607944, "grad_norm": 0.18606658279895782, "learning_rate": 0.001, "loss": 1.9416, "step": 303200 }, { "epoch": 17.739954377961045, "grad_norm": 0.26970741152763367, "learning_rate": 0.001, "loss": 1.9346, "step": 303300 }, { "epoch": 17.74580335731415, "grad_norm": 0.23198202252388, "learning_rate": 0.001, "loss": 1.9453, "step": 303400 }, { "epoch": 17.751652336667252, "grad_norm": 0.21916235983371735, "learning_rate": 0.001, "loss": 1.9457, "step": 303500 }, { "epoch": 17.757501316020356, "grad_norm": 0.27452990412712097, "learning_rate": 0.001, "loss": 1.9419, "step": 303600 }, { "epoch": 17.763350295373456, "grad_norm": 0.39553773403167725, "learning_rate": 0.001, "loss": 1.9412, "step": 303700 }, { "epoch": 17.76919927472656, "grad_norm": 0.20509134232997894, "learning_rate": 0.001, "loss": 1.9446, "step": 303800 }, { "epoch": 17.775048254079664, "grad_norm": 0.1775122433900833, "learning_rate": 0.001, "loss": 1.9418, "step": 303900 }, { "epoch": 17.780897233432768, "grad_norm": 0.21711328625679016, "learning_rate": 0.001, "loss": 1.9425, "step": 304000 }, { "epoch": 17.786746212785868, "grad_norm": 0.3603789508342743, "learning_rate": 0.001, "loss": 1.9494, "step": 304100 }, { "epoch": 17.792595192138972, "grad_norm": 0.17837373912334442, "learning_rate": 0.001, "loss": 1.9474, "step": 304200 }, { "epoch": 17.798444171492076, "grad_norm": 0.24848859012126923, "learning_rate": 0.001, "loss": 1.9451, "step": 304300 }, { "epoch": 17.804293150845176, "grad_norm": 0.25937017798423767, "learning_rate": 0.001, "loss": 1.9415, "step": 304400 }, { "epoch": 17.81014213019828, "grad_norm": 0.241023451089859, "learning_rate": 0.001, "loss": 1.9456, "step": 304500 }, { "epoch": 17.815991109551383, "grad_norm": 0.21971668303012848, "learning_rate": 0.001, "loss": 1.9445, "step": 304600 }, { "epoch": 17.821840088904487, "grad_norm": 0.24685174226760864, "learning_rate": 0.001, "loss": 1.9395, "step": 304700 }, { "epoch": 17.827689068257587, "grad_norm": 0.21962955594062805, "learning_rate": 0.001, "loss": 1.9455, "step": 304800 }, { "epoch": 17.83353804761069, "grad_norm": 0.3375236690044403, "learning_rate": 0.001, "loss": 1.9407, "step": 304900 }, { "epoch": 17.839387026963795, "grad_norm": 0.32685354351997375, "learning_rate": 0.001, "loss": 1.9442, "step": 305000 }, { "epoch": 17.8452360063169, "grad_norm": 0.2933027446269989, "learning_rate": 0.001, "loss": 1.9492, "step": 305100 }, { "epoch": 17.85108498567, "grad_norm": 0.1855825036764145, "learning_rate": 0.001, "loss": 1.9397, "step": 305200 }, { "epoch": 17.856933965023103, "grad_norm": 0.20291782915592194, "learning_rate": 0.001, "loss": 1.9381, "step": 305300 }, { "epoch": 17.862782944376207, "grad_norm": 0.17651918530464172, "learning_rate": 0.001, "loss": 1.9451, "step": 305400 }, { "epoch": 17.86863192372931, "grad_norm": 0.15962086617946625, "learning_rate": 0.001, "loss": 1.9411, "step": 305500 }, { "epoch": 17.87448090308241, "grad_norm": 0.22995716333389282, "learning_rate": 0.001, "loss": 1.943, "step": 305600 }, { "epoch": 17.880329882435515, "grad_norm": 0.19060948491096497, "learning_rate": 0.001, "loss": 1.9392, "step": 305700 }, { "epoch": 17.88617886178862, "grad_norm": 0.2537078857421875, "learning_rate": 0.001, "loss": 1.9495, "step": 305800 }, { "epoch": 17.892027841141722, "grad_norm": 0.16928234696388245, "learning_rate": 0.001, "loss": 1.9407, "step": 305900 }, { "epoch": 17.897876820494822, "grad_norm": 0.1625509411096573, "learning_rate": 0.001, "loss": 1.942, "step": 306000 }, { "epoch": 17.903725799847926, "grad_norm": 0.2288513481616974, "learning_rate": 0.001, "loss": 1.9415, "step": 306100 }, { "epoch": 17.90957477920103, "grad_norm": 0.14674997329711914, "learning_rate": 0.001, "loss": 1.9368, "step": 306200 }, { "epoch": 17.915423758554134, "grad_norm": 0.16518659889698029, "learning_rate": 0.001, "loss": 1.9451, "step": 306300 }, { "epoch": 17.921272737907234, "grad_norm": 0.3030222952365875, "learning_rate": 0.001, "loss": 1.9425, "step": 306400 }, { "epoch": 17.927121717260338, "grad_norm": 0.20431722700595856, "learning_rate": 0.001, "loss": 1.9443, "step": 306500 }, { "epoch": 17.93297069661344, "grad_norm": 0.22539925575256348, "learning_rate": 0.001, "loss": 1.9437, "step": 306600 }, { "epoch": 17.938819675966545, "grad_norm": 0.1825663447380066, "learning_rate": 0.001, "loss": 1.9435, "step": 306700 }, { "epoch": 17.944668655319646, "grad_norm": 0.24068433046340942, "learning_rate": 0.001, "loss": 1.9374, "step": 306800 }, { "epoch": 17.95051763467275, "grad_norm": 0.24235346913337708, "learning_rate": 0.001, "loss": 1.9426, "step": 306900 }, { "epoch": 17.956366614025853, "grad_norm": 0.23330536484718323, "learning_rate": 0.001, "loss": 1.9452, "step": 307000 }, { "epoch": 17.962215593378957, "grad_norm": 0.21722246706485748, "learning_rate": 0.001, "loss": 1.9475, "step": 307100 }, { "epoch": 17.968064572732057, "grad_norm": 0.2345615029335022, "learning_rate": 0.001, "loss": 1.9405, "step": 307200 }, { "epoch": 17.97391355208516, "grad_norm": 0.2583528757095337, "learning_rate": 0.001, "loss": 1.9414, "step": 307300 }, { "epoch": 17.979762531438265, "grad_norm": 0.21906530857086182, "learning_rate": 0.001, "loss": 1.9545, "step": 307400 }, { "epoch": 17.985611510791365, "grad_norm": 0.2126021534204483, "learning_rate": 0.001, "loss": 1.9458, "step": 307500 }, { "epoch": 17.99146049014447, "grad_norm": 0.21245218813419342, "learning_rate": 0.001, "loss": 1.949, "step": 307600 }, { "epoch": 17.997309469497573, "grad_norm": 0.17717915773391724, "learning_rate": 0.001, "loss": 1.9487, "step": 307700 }, { "epoch": 18.003158448850677, "grad_norm": 0.2617747485637665, "learning_rate": 0.001, "loss": 1.939, "step": 307800 }, { "epoch": 18.009007428203777, "grad_norm": 0.1993645280599594, "learning_rate": 0.001, "loss": 1.9251, "step": 307900 }, { "epoch": 18.01485640755688, "grad_norm": 0.21609535813331604, "learning_rate": 0.001, "loss": 1.9235, "step": 308000 }, { "epoch": 18.020705386909984, "grad_norm": 0.20833264291286469, "learning_rate": 0.001, "loss": 1.9279, "step": 308100 }, { "epoch": 18.02655436626309, "grad_norm": 0.2577674388885498, "learning_rate": 0.001, "loss": 1.9277, "step": 308200 }, { "epoch": 18.03240334561619, "grad_norm": 0.23772627115249634, "learning_rate": 0.001, "loss": 1.9264, "step": 308300 }, { "epoch": 18.038252324969292, "grad_norm": 0.15751619637012482, "learning_rate": 0.001, "loss": 1.928, "step": 308400 }, { "epoch": 18.044101304322396, "grad_norm": 0.2766701281070709, "learning_rate": 0.001, "loss": 1.9277, "step": 308500 }, { "epoch": 18.0499502836755, "grad_norm": 0.22739019989967346, "learning_rate": 0.001, "loss": 1.9218, "step": 308600 }, { "epoch": 18.0557992630286, "grad_norm": 0.2565115988254547, "learning_rate": 0.001, "loss": 1.9253, "step": 308700 }, { "epoch": 18.061648242381704, "grad_norm": 0.23688481748104095, "learning_rate": 0.001, "loss": 1.9274, "step": 308800 }, { "epoch": 18.067497221734808, "grad_norm": 0.2522827982902527, "learning_rate": 0.001, "loss": 1.9316, "step": 308900 }, { "epoch": 18.07334620108791, "grad_norm": 0.2697749435901642, "learning_rate": 0.001, "loss": 1.9275, "step": 309000 }, { "epoch": 18.079195180441012, "grad_norm": 0.1612410992383957, "learning_rate": 0.001, "loss": 1.9274, "step": 309100 }, { "epoch": 18.085044159794116, "grad_norm": 0.207436665892601, "learning_rate": 0.001, "loss": 1.9303, "step": 309200 }, { "epoch": 18.09089313914722, "grad_norm": 0.2743658125400543, "learning_rate": 0.001, "loss": 1.9336, "step": 309300 }, { "epoch": 18.096742118500323, "grad_norm": 0.23372870683670044, "learning_rate": 0.001, "loss": 1.9278, "step": 309400 }, { "epoch": 18.102591097853423, "grad_norm": 0.338800311088562, "learning_rate": 0.001, "loss": 1.9317, "step": 309500 }, { "epoch": 18.108440077206527, "grad_norm": 0.18332459032535553, "learning_rate": 0.001, "loss": 1.9346, "step": 309600 }, { "epoch": 18.11428905655963, "grad_norm": 0.18658562004566193, "learning_rate": 0.001, "loss": 1.9347, "step": 309700 }, { "epoch": 18.120138035912735, "grad_norm": 0.18554525077342987, "learning_rate": 0.001, "loss": 1.9321, "step": 309800 }, { "epoch": 18.125987015265835, "grad_norm": 0.21249736845493317, "learning_rate": 0.001, "loss": 1.9276, "step": 309900 }, { "epoch": 18.13183599461894, "grad_norm": 0.25892964005470276, "learning_rate": 0.001, "loss": 1.9331, "step": 310000 }, { "epoch": 18.137684973972043, "grad_norm": 0.30948683619499207, "learning_rate": 0.001, "loss": 1.9307, "step": 310100 }, { "epoch": 18.143533953325147, "grad_norm": 0.2167530208826065, "learning_rate": 0.001, "loss": 1.9284, "step": 310200 }, { "epoch": 18.149382932678247, "grad_norm": 0.20357146859169006, "learning_rate": 0.001, "loss": 1.9282, "step": 310300 }, { "epoch": 18.15523191203135, "grad_norm": 0.2650754153728485, "learning_rate": 0.001, "loss": 1.9318, "step": 310400 }, { "epoch": 18.161080891384454, "grad_norm": 0.26636868715286255, "learning_rate": 0.001, "loss": 1.9364, "step": 310500 }, { "epoch": 18.166929870737555, "grad_norm": 0.24999219179153442, "learning_rate": 0.001, "loss": 1.9299, "step": 310600 }, { "epoch": 18.17277885009066, "grad_norm": 0.23913021385669708, "learning_rate": 0.001, "loss": 1.932, "step": 310700 }, { "epoch": 18.178627829443762, "grad_norm": 0.20610623061656952, "learning_rate": 0.001, "loss": 1.9282, "step": 310800 }, { "epoch": 18.184476808796866, "grad_norm": 0.2533024549484253, "learning_rate": 0.001, "loss": 1.9291, "step": 310900 }, { "epoch": 18.190325788149966, "grad_norm": 0.16906137764453888, "learning_rate": 0.001, "loss": 1.9258, "step": 311000 }, { "epoch": 18.19617476750307, "grad_norm": 0.20412631332874298, "learning_rate": 0.001, "loss": 1.9294, "step": 311100 }, { "epoch": 18.202023746856174, "grad_norm": 0.282183438539505, "learning_rate": 0.001, "loss": 1.9305, "step": 311200 }, { "epoch": 18.207872726209278, "grad_norm": 0.29730668663978577, "learning_rate": 0.001, "loss": 1.9274, "step": 311300 }, { "epoch": 18.213721705562378, "grad_norm": 0.20742468535900116, "learning_rate": 0.001, "loss": 1.9321, "step": 311400 }, { "epoch": 18.21957068491548, "grad_norm": 0.2576863467693329, "learning_rate": 0.001, "loss": 1.929, "step": 311500 }, { "epoch": 18.225419664268586, "grad_norm": 0.20479761064052582, "learning_rate": 0.001, "loss": 1.9289, "step": 311600 }, { "epoch": 18.23126864362169, "grad_norm": 0.24074363708496094, "learning_rate": 0.001, "loss": 1.9316, "step": 311700 }, { "epoch": 18.23711762297479, "grad_norm": 0.1958894431591034, "learning_rate": 0.001, "loss": 1.9374, "step": 311800 }, { "epoch": 18.242966602327893, "grad_norm": 0.23745808005332947, "learning_rate": 0.001, "loss": 1.9327, "step": 311900 }, { "epoch": 18.248815581680997, "grad_norm": 0.2236100435256958, "learning_rate": 0.001, "loss": 1.9314, "step": 312000 }, { "epoch": 18.2546645610341, "grad_norm": 0.14753299951553345, "learning_rate": 0.001, "loss": 1.9305, "step": 312100 }, { "epoch": 18.2605135403872, "grad_norm": 0.20223373174667358, "learning_rate": 0.001, "loss": 1.9253, "step": 312200 }, { "epoch": 18.266362519740305, "grad_norm": 0.2412073314189911, "learning_rate": 0.001, "loss": 1.9309, "step": 312300 }, { "epoch": 18.27221149909341, "grad_norm": 0.2539530098438263, "learning_rate": 0.001, "loss": 1.9264, "step": 312400 }, { "epoch": 18.278060478446513, "grad_norm": 0.27049311995506287, "learning_rate": 0.001, "loss": 1.939, "step": 312500 }, { "epoch": 18.283909457799613, "grad_norm": 0.2517223358154297, "learning_rate": 0.001, "loss": 1.9331, "step": 312600 }, { "epoch": 18.289758437152717, "grad_norm": 0.21028171479701996, "learning_rate": 0.001, "loss": 1.9311, "step": 312700 }, { "epoch": 18.29560741650582, "grad_norm": 0.21401461958885193, "learning_rate": 0.001, "loss": 1.9369, "step": 312800 }, { "epoch": 18.301456395858924, "grad_norm": 0.16939279437065125, "learning_rate": 0.001, "loss": 1.9346, "step": 312900 }, { "epoch": 18.307305375212025, "grad_norm": 0.1955314427614212, "learning_rate": 0.001, "loss": 1.9252, "step": 313000 }, { "epoch": 18.31315435456513, "grad_norm": 0.22075851261615753, "learning_rate": 0.001, "loss": 1.933, "step": 313100 }, { "epoch": 18.319003333918232, "grad_norm": 0.16511844098567963, "learning_rate": 0.001, "loss": 1.9268, "step": 313200 }, { "epoch": 18.324852313271336, "grad_norm": 0.22939902544021606, "learning_rate": 0.001, "loss": 1.9315, "step": 313300 }, { "epoch": 18.330701292624436, "grad_norm": 0.21406109631061554, "learning_rate": 0.001, "loss": 1.9316, "step": 313400 }, { "epoch": 18.33655027197754, "grad_norm": 0.21634475886821747, "learning_rate": 0.001, "loss": 1.9311, "step": 313500 }, { "epoch": 18.342399251330644, "grad_norm": 0.18981929123401642, "learning_rate": 0.001, "loss": 1.9329, "step": 313600 }, { "epoch": 18.348248230683744, "grad_norm": 0.20753800868988037, "learning_rate": 0.001, "loss": 1.9356, "step": 313700 }, { "epoch": 18.354097210036848, "grad_norm": 0.21788734197616577, "learning_rate": 0.001, "loss": 1.9353, "step": 313800 }, { "epoch": 18.35994618938995, "grad_norm": 0.27464497089385986, "learning_rate": 0.001, "loss": 1.9275, "step": 313900 }, { "epoch": 18.365795168743055, "grad_norm": 0.2805996537208557, "learning_rate": 0.001, "loss": 1.9347, "step": 314000 }, { "epoch": 18.371644148096156, "grad_norm": 0.21761642396450043, "learning_rate": 0.001, "loss": 1.9305, "step": 314100 }, { "epoch": 18.37749312744926, "grad_norm": 0.2555422782897949, "learning_rate": 0.001, "loss": 1.9336, "step": 314200 }, { "epoch": 18.383342106802363, "grad_norm": 0.23383139073848724, "learning_rate": 0.001, "loss": 1.9326, "step": 314300 }, { "epoch": 18.389191086155467, "grad_norm": 0.24915537238121033, "learning_rate": 0.001, "loss": 1.9372, "step": 314400 }, { "epoch": 18.395040065508567, "grad_norm": 0.21654380857944489, "learning_rate": 0.001, "loss": 1.9353, "step": 314500 }, { "epoch": 18.40088904486167, "grad_norm": 0.19306959211826324, "learning_rate": 0.001, "loss": 1.9324, "step": 314600 }, { "epoch": 18.406738024214775, "grad_norm": 0.23307904601097107, "learning_rate": 0.001, "loss": 1.9294, "step": 314700 }, { "epoch": 18.41258700356788, "grad_norm": 0.25946250557899475, "learning_rate": 0.001, "loss": 1.9363, "step": 314800 }, { "epoch": 18.41843598292098, "grad_norm": 0.2124115228652954, "learning_rate": 0.001, "loss": 1.9377, "step": 314900 }, { "epoch": 18.424284962274083, "grad_norm": 0.17603275179862976, "learning_rate": 0.001, "loss": 1.9357, "step": 315000 }, { "epoch": 18.430133941627187, "grad_norm": 0.19406448304653168, "learning_rate": 0.001, "loss": 1.9318, "step": 315100 }, { "epoch": 18.43598292098029, "grad_norm": 0.2186225801706314, "learning_rate": 0.001, "loss": 1.9275, "step": 315200 }, { "epoch": 18.44183190033339, "grad_norm": 0.26899200677871704, "learning_rate": 0.001, "loss": 1.9326, "step": 315300 }, { "epoch": 18.447680879686494, "grad_norm": 0.22040551900863647, "learning_rate": 0.001, "loss": 1.9317, "step": 315400 }, { "epoch": 18.453529859039598, "grad_norm": 0.2692653238773346, "learning_rate": 0.001, "loss": 1.9306, "step": 315500 }, { "epoch": 18.459378838392702, "grad_norm": 0.19093593955039978, "learning_rate": 0.001, "loss": 1.9332, "step": 315600 }, { "epoch": 18.465227817745802, "grad_norm": 0.23056864738464355, "learning_rate": 0.001, "loss": 1.9276, "step": 315700 }, { "epoch": 18.471076797098906, "grad_norm": 0.240451380610466, "learning_rate": 0.001, "loss": 1.9332, "step": 315800 }, { "epoch": 18.47692577645201, "grad_norm": 0.26848313212394714, "learning_rate": 0.001, "loss": 1.9336, "step": 315900 }, { "epoch": 18.482774755805114, "grad_norm": 0.16756375133991241, "learning_rate": 0.001, "loss": 1.9269, "step": 316000 }, { "epoch": 18.488623735158214, "grad_norm": 0.2332996428012848, "learning_rate": 0.001, "loss": 1.9293, "step": 316100 }, { "epoch": 18.494472714511318, "grad_norm": 0.22997532784938812, "learning_rate": 0.001, "loss": 1.9364, "step": 316200 }, { "epoch": 18.50032169386442, "grad_norm": 0.23993171751499176, "learning_rate": 0.001, "loss": 1.9348, "step": 316300 }, { "epoch": 18.506170673217525, "grad_norm": 0.1900825798511505, "learning_rate": 0.001, "loss": 1.9296, "step": 316400 }, { "epoch": 18.512019652570626, "grad_norm": 0.22587850689888, "learning_rate": 0.001, "loss": 1.933, "step": 316500 }, { "epoch": 18.51786863192373, "grad_norm": 0.19188326597213745, "learning_rate": 0.001, "loss": 1.9288, "step": 316600 }, { "epoch": 18.523717611276833, "grad_norm": 0.17770954966545105, "learning_rate": 0.001, "loss": 1.9309, "step": 316700 }, { "epoch": 18.529566590629933, "grad_norm": 0.21405623853206635, "learning_rate": 0.001, "loss": 1.9323, "step": 316800 }, { "epoch": 18.535415569983037, "grad_norm": 0.19212375581264496, "learning_rate": 0.001, "loss": 1.9333, "step": 316900 }, { "epoch": 18.54126454933614, "grad_norm": 0.2617257535457611, "learning_rate": 0.001, "loss": 1.9447, "step": 317000 }, { "epoch": 18.547113528689245, "grad_norm": 0.23038695752620697, "learning_rate": 0.001, "loss": 1.941, "step": 317100 }, { "epoch": 18.552962508042345, "grad_norm": 0.23507943749427795, "learning_rate": 0.001, "loss": 1.9332, "step": 317200 }, { "epoch": 18.55881148739545, "grad_norm": 0.2217400074005127, "learning_rate": 0.001, "loss": 1.938, "step": 317300 }, { "epoch": 18.564660466748553, "grad_norm": 0.22467897832393646, "learning_rate": 0.001, "loss": 1.9342, "step": 317400 }, { "epoch": 18.570509446101656, "grad_norm": 0.17814381420612335, "learning_rate": 0.001, "loss": 1.9337, "step": 317500 }, { "epoch": 18.576358425454757, "grad_norm": 0.2980905771255493, "learning_rate": 0.001, "loss": 1.9387, "step": 317600 }, { "epoch": 18.58220740480786, "grad_norm": 0.2046939730644226, "learning_rate": 0.001, "loss": 1.9334, "step": 317700 }, { "epoch": 18.588056384160964, "grad_norm": 0.23499973118305206, "learning_rate": 0.001, "loss": 1.9314, "step": 317800 }, { "epoch": 18.593905363514068, "grad_norm": 0.257149338722229, "learning_rate": 0.001, "loss": 1.9394, "step": 317900 }, { "epoch": 18.59975434286717, "grad_norm": 0.27882224321365356, "learning_rate": 0.001, "loss": 1.9328, "step": 318000 }, { "epoch": 18.605603322220272, "grad_norm": 0.20154021680355072, "learning_rate": 0.001, "loss": 1.9359, "step": 318100 }, { "epoch": 18.611452301573376, "grad_norm": 0.1940736025571823, "learning_rate": 0.001, "loss": 1.9278, "step": 318200 }, { "epoch": 18.61730128092648, "grad_norm": 0.22476302087306976, "learning_rate": 0.001, "loss": 1.9347, "step": 318300 }, { "epoch": 18.62315026027958, "grad_norm": 0.2277461141347885, "learning_rate": 0.001, "loss": 1.9331, "step": 318400 }, { "epoch": 18.628999239632684, "grad_norm": 0.2538410425186157, "learning_rate": 0.001, "loss": 1.9357, "step": 318500 }, { "epoch": 18.634848218985788, "grad_norm": 0.23504558205604553, "learning_rate": 0.001, "loss": 1.9341, "step": 318600 }, { "epoch": 18.64069719833889, "grad_norm": 0.15709306299686432, "learning_rate": 0.001, "loss": 1.941, "step": 318700 }, { "epoch": 18.64654617769199, "grad_norm": 0.23162133991718292, "learning_rate": 0.001, "loss": 1.9338, "step": 318800 }, { "epoch": 18.652395157045095, "grad_norm": 0.1988559365272522, "learning_rate": 0.001, "loss": 1.9312, "step": 318900 }, { "epoch": 18.6582441363982, "grad_norm": 0.18961964547634125, "learning_rate": 0.001, "loss": 1.9291, "step": 319000 }, { "epoch": 18.664093115751303, "grad_norm": 0.22819578647613525, "learning_rate": 0.001, "loss": 1.9336, "step": 319100 }, { "epoch": 18.669942095104403, "grad_norm": 0.24958768486976624, "learning_rate": 0.001, "loss": 1.9361, "step": 319200 }, { "epoch": 18.675791074457507, "grad_norm": 0.175850510597229, "learning_rate": 0.001, "loss": 1.9371, "step": 319300 }, { "epoch": 18.68164005381061, "grad_norm": 0.21777726709842682, "learning_rate": 0.001, "loss": 1.932, "step": 319400 }, { "epoch": 18.687489033163715, "grad_norm": 0.2122231274843216, "learning_rate": 0.001, "loss": 1.9371, "step": 319500 }, { "epoch": 18.693338012516815, "grad_norm": 0.20598562061786652, "learning_rate": 0.001, "loss": 1.9396, "step": 319600 }, { "epoch": 18.69918699186992, "grad_norm": 0.1907041072845459, "learning_rate": 0.001, "loss": 1.9331, "step": 319700 }, { "epoch": 18.705035971223023, "grad_norm": 0.3466949462890625, "learning_rate": 0.001, "loss": 1.938, "step": 319800 }, { "epoch": 18.710884950576123, "grad_norm": 0.23499339818954468, "learning_rate": 0.001, "loss": 1.9346, "step": 319900 }, { "epoch": 18.716733929929227, "grad_norm": 0.15244954824447632, "learning_rate": 0.001, "loss": 1.9313, "step": 320000 }, { "epoch": 18.72258290928233, "grad_norm": 0.23682300746440887, "learning_rate": 0.001, "loss": 1.9345, "step": 320100 }, { "epoch": 18.728431888635434, "grad_norm": 0.26577314734458923, "learning_rate": 0.001, "loss": 1.9343, "step": 320200 }, { "epoch": 18.734280867988534, "grad_norm": 0.2560271620750427, "learning_rate": 0.001, "loss": 1.933, "step": 320300 }, { "epoch": 18.74012984734164, "grad_norm": 0.20610709488391876, "learning_rate": 0.001, "loss": 1.9351, "step": 320400 }, { "epoch": 18.745978826694742, "grad_norm": 0.3448598384857178, "learning_rate": 0.001, "loss": 1.9386, "step": 320500 }, { "epoch": 18.751827806047846, "grad_norm": 0.17108173668384552, "learning_rate": 0.001, "loss": 1.9381, "step": 320600 }, { "epoch": 18.757676785400946, "grad_norm": 0.19307543337345123, "learning_rate": 0.001, "loss": 1.9377, "step": 320700 }, { "epoch": 18.76352576475405, "grad_norm": 0.21763166785240173, "learning_rate": 0.001, "loss": 1.9394, "step": 320800 }, { "epoch": 18.769374744107154, "grad_norm": 0.22722221910953522, "learning_rate": 0.001, "loss": 1.9331, "step": 320900 }, { "epoch": 18.775223723460257, "grad_norm": 0.26126518845558167, "learning_rate": 0.001, "loss": 1.9297, "step": 321000 }, { "epoch": 18.781072702813358, "grad_norm": 0.26727375388145447, "learning_rate": 0.001, "loss": 1.9353, "step": 321100 }, { "epoch": 18.78692168216646, "grad_norm": 0.22584015130996704, "learning_rate": 0.001, "loss": 1.9273, "step": 321200 }, { "epoch": 18.792770661519565, "grad_norm": 0.24690447747707367, "learning_rate": 0.001, "loss": 1.9333, "step": 321300 }, { "epoch": 18.79861964087267, "grad_norm": 0.22139619290828705, "learning_rate": 0.001, "loss": 1.9338, "step": 321400 }, { "epoch": 18.80446862022577, "grad_norm": 0.2864610552787781, "learning_rate": 0.001, "loss": 1.937, "step": 321500 }, { "epoch": 18.810317599578873, "grad_norm": 0.27540943026542664, "learning_rate": 0.001, "loss": 1.9356, "step": 321600 }, { "epoch": 18.816166578931977, "grad_norm": 0.18972404301166534, "learning_rate": 0.001, "loss": 1.9346, "step": 321700 }, { "epoch": 18.82201555828508, "grad_norm": 0.2558997571468353, "learning_rate": 0.001, "loss": 1.9297, "step": 321800 }, { "epoch": 18.82786453763818, "grad_norm": 0.26690083742141724, "learning_rate": 0.001, "loss": 1.9337, "step": 321900 }, { "epoch": 18.833713516991285, "grad_norm": 0.2634173631668091, "learning_rate": 0.001, "loss": 1.9387, "step": 322000 }, { "epoch": 18.83956249634439, "grad_norm": 0.1924644559621811, "learning_rate": 0.001, "loss": 1.9314, "step": 322100 }, { "epoch": 18.845411475697492, "grad_norm": 0.26470065116882324, "learning_rate": 0.001, "loss": 1.9369, "step": 322200 }, { "epoch": 18.851260455050593, "grad_norm": 0.23879918456077576, "learning_rate": 0.001, "loss": 1.9313, "step": 322300 }, { "epoch": 18.857109434403696, "grad_norm": 0.1784610003232956, "learning_rate": 0.001, "loss": 1.9324, "step": 322400 }, { "epoch": 18.8629584137568, "grad_norm": 0.19422508776187897, "learning_rate": 0.001, "loss": 1.9302, "step": 322500 }, { "epoch": 18.868807393109904, "grad_norm": 0.23871001601219177, "learning_rate": 0.001, "loss": 1.9352, "step": 322600 }, { "epoch": 18.874656372463004, "grad_norm": 0.25180310010910034, "learning_rate": 0.001, "loss": 1.9326, "step": 322700 }, { "epoch": 18.880505351816108, "grad_norm": 0.1677384078502655, "learning_rate": 0.001, "loss": 1.9328, "step": 322800 }, { "epoch": 18.886354331169212, "grad_norm": 0.16522617638111115, "learning_rate": 0.001, "loss": 1.9331, "step": 322900 }, { "epoch": 18.892203310522312, "grad_norm": 0.209886834025383, "learning_rate": 0.001, "loss": 1.9345, "step": 323000 }, { "epoch": 18.898052289875416, "grad_norm": 0.18697498738765717, "learning_rate": 0.001, "loss": 1.9435, "step": 323100 }, { "epoch": 18.90390126922852, "grad_norm": 0.22314664721488953, "learning_rate": 0.001, "loss": 1.9364, "step": 323200 }, { "epoch": 18.909750248581624, "grad_norm": 0.25124919414520264, "learning_rate": 0.001, "loss": 1.9324, "step": 323300 }, { "epoch": 18.915599227934724, "grad_norm": 0.2595070004463196, "learning_rate": 0.001, "loss": 1.942, "step": 323400 }, { "epoch": 18.921448207287828, "grad_norm": 0.21519462764263153, "learning_rate": 0.001, "loss": 1.938, "step": 323500 }, { "epoch": 18.92729718664093, "grad_norm": 0.22979673743247986, "learning_rate": 0.001, "loss": 1.9316, "step": 323600 }, { "epoch": 18.933146165994035, "grad_norm": 0.1657300442457199, "learning_rate": 0.001, "loss": 1.9302, "step": 323700 }, { "epoch": 18.938995145347135, "grad_norm": 0.20260418951511383, "learning_rate": 0.001, "loss": 1.9313, "step": 323800 }, { "epoch": 18.94484412470024, "grad_norm": 0.21975353360176086, "learning_rate": 0.001, "loss": 1.935, "step": 323900 }, { "epoch": 18.950693104053343, "grad_norm": 0.23747476935386658, "learning_rate": 0.001, "loss": 1.9289, "step": 324000 }, { "epoch": 18.956542083406447, "grad_norm": 0.26488372683525085, "learning_rate": 0.001, "loss": 1.9411, "step": 324100 }, { "epoch": 18.962391062759547, "grad_norm": 0.24531471729278564, "learning_rate": 0.001, "loss": 1.9449, "step": 324200 }, { "epoch": 18.96824004211265, "grad_norm": 0.23738595843315125, "learning_rate": 0.001, "loss": 1.9338, "step": 324300 }, { "epoch": 18.974089021465755, "grad_norm": 0.24494440853595734, "learning_rate": 0.001, "loss": 1.9331, "step": 324400 }, { "epoch": 18.97993800081886, "grad_norm": 0.2118752896785736, "learning_rate": 0.001, "loss": 1.9374, "step": 324500 }, { "epoch": 18.98578698017196, "grad_norm": 0.2432660162448883, "learning_rate": 0.001, "loss": 1.9385, "step": 324600 }, { "epoch": 18.991635959525063, "grad_norm": 0.18446767330169678, "learning_rate": 0.001, "loss": 1.9358, "step": 324700 }, { "epoch": 18.997484938878166, "grad_norm": 0.2709226608276367, "learning_rate": 0.001, "loss": 1.9326, "step": 324800 }, { "epoch": 19.00333391823127, "grad_norm": 0.2262198030948639, "learning_rate": 0.001, "loss": 1.93, "step": 324900 }, { "epoch": 19.00918289758437, "grad_norm": 0.19341738522052765, "learning_rate": 0.001, "loss": 1.92, "step": 325000 }, { "epoch": 19.00918289758437, "eval_ag_news_accuracy": 0.238859375, "eval_ag_news_bleu_score": 6.266292235422775, "eval_ag_news_bleu_score_sem": 0.4176068815884829, "eval_ag_news_emb_cos_sim": 0.691103458404541, "eval_ag_news_emb_cos_sim_sem": 0.015364017337560654, "eval_ag_news_emb_top1_equal": 0.9296875, "eval_ag_news_emb_top1_equal_sem": 0.022687306627631187, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.73986554145813, "eval_ag_news_n_ngrams_match_1": 13.7890625, "eval_ag_news_n_ngrams_match_2": 3.7734375, "eval_ag_news_n_ngrams_match_3": 1.2734375, "eval_ag_news_num_pred_words": 48.25, "eval_ag_news_num_true_words": 45.6328125, "eval_ag_news_perplexity": 15.484902878895014, "eval_ag_news_pred_num_tokens": 72.5625, "eval_ag_news_rouge_score": 0.2844357481886929, "eval_ag_news_runtime": 37.6031, "eval_ag_news_samples_per_second": 13.297, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.31491887501166393, "eval_ag_news_token_set_f1_sem": 0.009457948478785295, "eval_ag_news_token_set_precision": 0.2974193946756636, "eval_ag_news_token_set_recall": 0.34669667134115617, "eval_ag_news_true_num_tokens": 62.2890625, "step": 325000 }, { "epoch": 19.00918289758437, "eval_anthropic_toxic_prompts_accuracy": 0.10215625, "eval_anthropic_toxic_prompts_bleu_score": 37.15890991209547, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.3891493556461874, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8685302734375, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011363836005330086, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.984375, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.011004959233105183, "eval_anthropic_toxic_prompts_exact_match": 0.078125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.023813825861573653, "eval_anthropic_toxic_prompts_loss": 1.3049360513687134, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.3046875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.3671875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.421875, "eval_anthropic_toxic_prompts_num_pred_words": 16.8671875, "eval_anthropic_toxic_prompts_num_true_words": 15.890625, "eval_anthropic_toxic_prompts_perplexity": 3.687453278574971, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.6875, "eval_anthropic_toxic_prompts_rouge_score": 0.6427034142456067, "eval_anthropic_toxic_prompts_runtime": 29.9023, "eval_anthropic_toxic_prompts_samples_per_second": 16.721, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.6636647438392579, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.0190208900755219, "eval_anthropic_toxic_prompts_token_set_precision": 0.6784715717973653, "eval_anthropic_toxic_prompts_token_set_recall": 0.6604786487083338, "eval_anthropic_toxic_prompts_true_num_tokens": 19.2578125, "step": 325000 }, { "epoch": 19.00918289758437, "eval_arxiv_accuracy": 0.37140625, "eval_arxiv_bleu_score": 1.6671994422877328, "eval_arxiv_bleu_score_sem": 0.14970482196481952, "eval_arxiv_emb_cos_sim": 0.42222094535827637, "eval_arxiv_emb_cos_sim_sem": 0.0172741636633873, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.454901933670044, "eval_arxiv_n_ngrams_match_1": 13.515625, "eval_arxiv_n_ngrams_match_2": 2.203125, "eval_arxiv_n_ngrams_match_3": 0.3828125, "eval_arxiv_num_pred_words": 56.046875, "eval_arxiv_num_true_words": 86.4453125, "eval_arxiv_perplexity": 31.655184222315075, "eval_arxiv_pred_num_tokens": 125.46875, "eval_arxiv_rouge_score": 0.1778905965012732, "eval_arxiv_runtime": 30.7799, "eval_arxiv_samples_per_second": 16.244, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.17078922836223484, "eval_arxiv_token_set_f1_sem": 0.008715554838122632, "eval_arxiv_token_set_precision": 0.115153639419256, "eval_arxiv_token_set_recall": 0.39502520770199645, "eval_arxiv_true_num_tokens": 123.9140625, "step": 325000 }, { "epoch": 19.00918289758437, "eval_python_code_alpaca_accuracy": 0.1275625, "eval_python_code_alpaca_bleu_score": 25.41957084275441, "eval_python_code_alpaca_bleu_score_sem": 1.6403611163764338, "eval_python_code_alpaca_emb_cos_sim": 0.8470686674118042, "eval_python_code_alpaca_emb_cos_sim_sem": 0.01117282547056675, "eval_python_code_alpaca_emb_top1_equal": 0.984375, "eval_python_code_alpaca_emb_top1_equal_sem": 0.011004959233105183, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5450462102890015, "eval_python_code_alpaca_n_ngrams_match_1": 10.1484375, "eval_python_code_alpaca_n_ngrams_match_2": 5.28125, "eval_python_code_alpaca_n_ngrams_match_3": 2.859375, "eval_python_code_alpaca_num_pred_words": 17.65625, "eval_python_code_alpaca_num_true_words": 18.4453125, "eval_python_code_alpaca_perplexity": 4.6881882645511315, "eval_python_code_alpaca_pred_num_tokens": 24.0234375, "eval_python_code_alpaca_rouge_score": 0.5764518452290843, "eval_python_code_alpaca_runtime": 30.6428, "eval_python_code_alpaca_samples_per_second": 16.317, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.592025560357571, "eval_python_code_alpaca_token_set_f1_sem": 0.013588725164996747, "eval_python_code_alpaca_token_set_precision": 0.5843211269569812, "eval_python_code_alpaca_token_set_recall": 0.6043663417043023, "eval_python_code_alpaca_true_num_tokens": 23.8828125, "step": 325000 }, { "epoch": 19.00918289758437, "eval_wikibio_accuracy": 0.35953125, "eval_wikibio_bleu_score": 6.685142502071509, "eval_wikibio_bleu_score_sem": 0.7029360763738545, "eval_wikibio_emb_cos_sim": 0.5495723485946655, "eval_wikibio_emb_cos_sim_sem": 0.02429945580661297, "eval_wikibio_emb_top1_equal": 0.90625, "eval_wikibio_emb_top1_equal_sem": 0.025864720344543457, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.75512957572937, "eval_wikibio_n_ngrams_match_1": 13.7265625, "eval_wikibio_n_ngrams_match_2": 4.4453125, "eval_wikibio_n_ngrams_match_3": 1.796875, "eval_wikibio_num_pred_words": 49.875, "eval_wikibio_num_true_words": 53.453125, "eval_wikibio_perplexity": 15.72307810012785, "eval_wikibio_pred_num_tokens": 107.921875, "eval_wikibio_rouge_score": 0.26642917346239137, "eval_wikibio_runtime": 30.6823, "eval_wikibio_samples_per_second": 16.296, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.2901205238935178, "eval_wikibio_token_set_f1_sem": 0.014174675015832685, "eval_wikibio_token_set_precision": 0.2569177262886858, "eval_wikibio_token_set_recall": 0.3766051910150452, "eval_wikibio_true_num_tokens": 102.2578125, "step": 325000 }, { "epoch": 19.00918289758437, "eval_msmarco_accuracy": 0.388109375, "eval_msmarco_bleu_score": 15.52748980122209, "eval_msmarco_bleu_score_sem": 1.4634752144158925, "eval_msmarco_emb_cos_sim": 0.7499059438705444, "eval_msmarco_emb_cos_sim_sem": 0.01728220097720623, "eval_msmarco_emb_top1_equal": 0.921875, "eval_msmarco_emb_top1_equal_sem": 0.023813825100660324, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8057570457458496, "eval_msmarco_n_ngrams_match_1": 27.3515625, "eval_msmarco_n_ngrams_match_2": 12.4609375, "eval_msmarco_n_ngrams_match_3": 7.03125, "eval_msmarco_num_pred_words": 62.3828125, "eval_msmarco_num_true_words": 62.34375, "eval_msmarco_perplexity": 6.084576007751404, "eval_msmarco_pred_num_tokens": 84.984375, "eval_msmarco_rouge_score": 0.4248777804267086, "eval_msmarco_runtime": 25.5205, "eval_msmarco_samples_per_second": 19.592, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.45462938496122046, "eval_msmarco_token_set_f1_sem": 0.014198288388011692, "eval_msmarco_token_set_precision": 0.41315843248191536, "eval_msmarco_token_set_recall": 0.5284728122749048, "eval_msmarco_true_num_tokens": 80.4609375, "step": 325000 }, { "epoch": 19.015031876937474, "grad_norm": 0.1795017272233963, "learning_rate": 0.001, "loss": 1.9208, "step": 325100 }, { "epoch": 19.020880856290578, "grad_norm": 0.22016407549381256, "learning_rate": 0.001, "loss": 1.9182, "step": 325200 }, { "epoch": 19.026729835643682, "grad_norm": 0.29245826601982117, "learning_rate": 0.001, "loss": 1.9196, "step": 325300 }, { "epoch": 19.032578814996782, "grad_norm": 0.25427189469337463, "learning_rate": 0.001, "loss": 1.9172, "step": 325400 }, { "epoch": 19.038427794349886, "grad_norm": 0.23167693614959717, "learning_rate": 0.001, "loss": 1.921, "step": 325500 }, { "epoch": 19.04427677370299, "grad_norm": 0.22339697182178497, "learning_rate": 0.001, "loss": 1.9161, "step": 325600 }, { "epoch": 19.050125753056093, "grad_norm": 0.18079610168933868, "learning_rate": 0.001, "loss": 1.9174, "step": 325700 }, { "epoch": 19.055974732409194, "grad_norm": 0.24337364733219147, "learning_rate": 0.001, "loss": 1.917, "step": 325800 }, { "epoch": 19.061823711762298, "grad_norm": 0.19538140296936035, "learning_rate": 0.001, "loss": 1.9262, "step": 325900 }, { "epoch": 19.0676726911154, "grad_norm": 0.2587549388408661, "learning_rate": 0.001, "loss": 1.9252, "step": 326000 }, { "epoch": 19.0735216704685, "grad_norm": 0.2531103193759918, "learning_rate": 0.001, "loss": 1.9212, "step": 326100 }, { "epoch": 19.079370649821605, "grad_norm": 0.18799199163913727, "learning_rate": 0.001, "loss": 1.9204, "step": 326200 }, { "epoch": 19.08521962917471, "grad_norm": 0.19558009505271912, "learning_rate": 0.001, "loss": 1.9238, "step": 326300 }, { "epoch": 19.091068608527813, "grad_norm": 0.19290263950824738, "learning_rate": 0.001, "loss": 1.9199, "step": 326400 }, { "epoch": 19.096917587880913, "grad_norm": 0.16572707891464233, "learning_rate": 0.001, "loss": 1.9163, "step": 326500 }, { "epoch": 19.102766567234017, "grad_norm": 0.2221585214138031, "learning_rate": 0.001, "loss": 1.9241, "step": 326600 }, { "epoch": 19.10861554658712, "grad_norm": 0.21760106086730957, "learning_rate": 0.001, "loss": 1.9197, "step": 326700 }, { "epoch": 19.114464525940225, "grad_norm": 0.2518640458583832, "learning_rate": 0.001, "loss": 1.9205, "step": 326800 }, { "epoch": 19.120313505293325, "grad_norm": 0.22285114228725433, "learning_rate": 0.001, "loss": 1.9244, "step": 326900 }, { "epoch": 19.12616248464643, "grad_norm": 0.18601615726947784, "learning_rate": 0.001, "loss": 1.9232, "step": 327000 }, { "epoch": 19.132011463999532, "grad_norm": 0.18095695972442627, "learning_rate": 0.001, "loss": 1.9229, "step": 327100 }, { "epoch": 19.137860443352636, "grad_norm": 0.21146444976329803, "learning_rate": 0.001, "loss": 1.9192, "step": 327200 }, { "epoch": 19.143709422705737, "grad_norm": 0.31318238377571106, "learning_rate": 0.001, "loss": 1.9267, "step": 327300 }, { "epoch": 19.14955840205884, "grad_norm": 0.18221637606620789, "learning_rate": 0.001, "loss": 1.9285, "step": 327400 }, { "epoch": 19.155407381411944, "grad_norm": 0.20249392092227936, "learning_rate": 0.001, "loss": 1.9178, "step": 327500 }, { "epoch": 19.161256360765048, "grad_norm": 0.20886947214603424, "learning_rate": 0.001, "loss": 1.9223, "step": 327600 }, { "epoch": 19.167105340118148, "grad_norm": 0.2567766010761261, "learning_rate": 0.001, "loss": 1.9232, "step": 327700 }, { "epoch": 19.172954319471252, "grad_norm": 0.21668311953544617, "learning_rate": 0.001, "loss": 1.9277, "step": 327800 }, { "epoch": 19.178803298824356, "grad_norm": 0.21578992903232574, "learning_rate": 0.001, "loss": 1.9243, "step": 327900 }, { "epoch": 19.18465227817746, "grad_norm": 0.19384923577308655, "learning_rate": 0.001, "loss": 1.917, "step": 328000 }, { "epoch": 19.19050125753056, "grad_norm": 0.1891591101884842, "learning_rate": 0.001, "loss": 1.9225, "step": 328100 }, { "epoch": 19.196350236883664, "grad_norm": 0.17318473756313324, "learning_rate": 0.001, "loss": 1.9224, "step": 328200 }, { "epoch": 19.202199216236767, "grad_norm": 0.24497990310192108, "learning_rate": 0.001, "loss": 1.9213, "step": 328300 }, { "epoch": 19.20804819558987, "grad_norm": 0.23646759986877441, "learning_rate": 0.001, "loss": 1.9219, "step": 328400 }, { "epoch": 19.21389717494297, "grad_norm": 0.21329189836978912, "learning_rate": 0.001, "loss": 1.9279, "step": 328500 }, { "epoch": 19.219746154296075, "grad_norm": 0.23010417819023132, "learning_rate": 0.001, "loss": 1.9261, "step": 328600 }, { "epoch": 19.22559513364918, "grad_norm": 0.2151850461959839, "learning_rate": 0.001, "loss": 1.9272, "step": 328700 }, { "epoch": 19.231444113002283, "grad_norm": 0.22959071397781372, "learning_rate": 0.001, "loss": 1.9223, "step": 328800 }, { "epoch": 19.237293092355383, "grad_norm": 0.23098771274089813, "learning_rate": 0.001, "loss": 1.9238, "step": 328900 }, { "epoch": 19.243142071708487, "grad_norm": 0.20450322329998016, "learning_rate": 0.001, "loss": 1.9226, "step": 329000 }, { "epoch": 19.24899105106159, "grad_norm": 0.25348731875419617, "learning_rate": 0.001, "loss": 1.9223, "step": 329100 }, { "epoch": 19.25484003041469, "grad_norm": 0.23342545330524445, "learning_rate": 0.001, "loss": 1.9269, "step": 329200 }, { "epoch": 19.260689009767795, "grad_norm": 0.225315123796463, "learning_rate": 0.001, "loss": 1.918, "step": 329300 }, { "epoch": 19.2665379891209, "grad_norm": 0.28075599670410156, "learning_rate": 0.001, "loss": 1.9279, "step": 329400 }, { "epoch": 19.272386968474002, "grad_norm": 0.27996471524238586, "learning_rate": 0.001, "loss": 1.9242, "step": 329500 }, { "epoch": 19.278235947827103, "grad_norm": 0.21661217510700226, "learning_rate": 0.001, "loss": 1.9277, "step": 329600 }, { "epoch": 19.284084927180206, "grad_norm": 0.21129964292049408, "learning_rate": 0.001, "loss": 1.9273, "step": 329700 }, { "epoch": 19.28993390653331, "grad_norm": 0.23607119917869568, "learning_rate": 0.001, "loss": 1.9224, "step": 329800 }, { "epoch": 19.295782885886414, "grad_norm": 0.25720518827438354, "learning_rate": 0.001, "loss": 1.9283, "step": 329900 }, { "epoch": 19.301631865239514, "grad_norm": 0.17902258038520813, "learning_rate": 0.001, "loss": 1.9253, "step": 330000 }, { "epoch": 19.307480844592618, "grad_norm": 0.22502392530441284, "learning_rate": 0.001, "loss": 1.9257, "step": 330100 }, { "epoch": 19.313329823945722, "grad_norm": 0.2627086043357849, "learning_rate": 0.001, "loss": 1.9217, "step": 330200 }, { "epoch": 19.319178803298826, "grad_norm": 0.17747698724269867, "learning_rate": 0.001, "loss": 1.9174, "step": 330300 }, { "epoch": 19.325027782651926, "grad_norm": 0.2623894214630127, "learning_rate": 0.001, "loss": 1.9241, "step": 330400 }, { "epoch": 19.33087676200503, "grad_norm": 0.24575242400169373, "learning_rate": 0.001, "loss": 1.9267, "step": 330500 }, { "epoch": 19.336725741358133, "grad_norm": 0.18571704626083374, "learning_rate": 0.001, "loss": 1.9296, "step": 330600 }, { "epoch": 19.342574720711237, "grad_norm": 0.22168536484241486, "learning_rate": 0.001, "loss": 1.9284, "step": 330700 }, { "epoch": 19.348423700064338, "grad_norm": 0.19373413920402527, "learning_rate": 0.001, "loss": 1.9273, "step": 330800 }, { "epoch": 19.35427267941744, "grad_norm": 0.24474336206912994, "learning_rate": 0.001, "loss": 1.9224, "step": 330900 }, { "epoch": 19.360121658770545, "grad_norm": 0.1878669410943985, "learning_rate": 0.001, "loss": 1.9264, "step": 331000 }, { "epoch": 19.36597063812365, "grad_norm": 0.27080923318862915, "learning_rate": 0.001, "loss": 1.9251, "step": 331100 }, { "epoch": 19.37181961747675, "grad_norm": 0.18298007547855377, "learning_rate": 0.001, "loss": 1.9296, "step": 331200 }, { "epoch": 19.377668596829853, "grad_norm": 0.2429371327161789, "learning_rate": 0.001, "loss": 1.9198, "step": 331300 }, { "epoch": 19.383517576182957, "grad_norm": 0.14770011603832245, "learning_rate": 0.001, "loss": 1.9207, "step": 331400 }, { "epoch": 19.38936655553606, "grad_norm": 0.24816492199897766, "learning_rate": 0.001, "loss": 1.924, "step": 331500 }, { "epoch": 19.39521553488916, "grad_norm": 0.2270064651966095, "learning_rate": 0.001, "loss": 1.9202, "step": 331600 }, { "epoch": 19.401064514242265, "grad_norm": 0.281017541885376, "learning_rate": 0.001, "loss": 1.9254, "step": 331700 }, { "epoch": 19.40691349359537, "grad_norm": 0.22936972975730896, "learning_rate": 0.001, "loss": 1.9298, "step": 331800 }, { "epoch": 19.412762472948472, "grad_norm": 0.1962900459766388, "learning_rate": 0.001, "loss": 1.9212, "step": 331900 }, { "epoch": 19.418611452301572, "grad_norm": 0.23298344016075134, "learning_rate": 0.001, "loss": 1.9246, "step": 332000 }, { "epoch": 19.424460431654676, "grad_norm": 0.24769024550914764, "learning_rate": 0.001, "loss": 1.927, "step": 332100 }, { "epoch": 19.43030941100778, "grad_norm": 0.20074273645877838, "learning_rate": 0.001, "loss": 1.929, "step": 332200 }, { "epoch": 19.43615839036088, "grad_norm": 0.19219569861888885, "learning_rate": 0.001, "loss": 1.9319, "step": 332300 }, { "epoch": 19.442007369713984, "grad_norm": 0.23259995877742767, "learning_rate": 0.001, "loss": 1.9248, "step": 332400 }, { "epoch": 19.447856349067088, "grad_norm": 0.22118261456489563, "learning_rate": 0.001, "loss": 1.9245, "step": 332500 }, { "epoch": 19.45370532842019, "grad_norm": 0.23445190489292145, "learning_rate": 0.001, "loss": 1.9261, "step": 332600 }, { "epoch": 19.459554307773292, "grad_norm": 0.2376803606748581, "learning_rate": 0.001, "loss": 1.9238, "step": 332700 }, { "epoch": 19.465403287126396, "grad_norm": 0.23591110110282898, "learning_rate": 0.001, "loss": 1.9262, "step": 332800 }, { "epoch": 19.4712522664795, "grad_norm": 0.19915832579135895, "learning_rate": 0.001, "loss": 1.9336, "step": 332900 }, { "epoch": 19.477101245832603, "grad_norm": 0.2429502010345459, "learning_rate": 0.001, "loss": 1.925, "step": 333000 }, { "epoch": 19.482950225185704, "grad_norm": 0.18787513673305511, "learning_rate": 0.001, "loss": 1.925, "step": 333100 }, { "epoch": 19.488799204538807, "grad_norm": 0.21414950489997864, "learning_rate": 0.001, "loss": 1.9264, "step": 333200 }, { "epoch": 19.49464818389191, "grad_norm": 0.20206014811992645, "learning_rate": 0.001, "loss": 1.9291, "step": 333300 }, { "epoch": 19.500497163245015, "grad_norm": 0.2581467926502228, "learning_rate": 0.001, "loss": 1.9285, "step": 333400 }, { "epoch": 19.506346142598115, "grad_norm": 0.2623070776462555, "learning_rate": 0.001, "loss": 1.9231, "step": 333500 }, { "epoch": 19.51219512195122, "grad_norm": 0.20621541142463684, "learning_rate": 0.001, "loss": 1.9246, "step": 333600 }, { "epoch": 19.518044101304323, "grad_norm": 0.2762615978717804, "learning_rate": 0.001, "loss": 1.9172, "step": 333700 }, { "epoch": 19.523893080657427, "grad_norm": 0.28858911991119385, "learning_rate": 0.001, "loss": 1.9337, "step": 333800 }, { "epoch": 19.529742060010527, "grad_norm": 0.25015002489089966, "learning_rate": 0.001, "loss": 1.9219, "step": 333900 }, { "epoch": 19.53559103936363, "grad_norm": 0.27083438634872437, "learning_rate": 0.001, "loss": 1.9223, "step": 334000 }, { "epoch": 19.541440018716735, "grad_norm": 0.2909761071205139, "learning_rate": 0.001, "loss": 1.9256, "step": 334100 }, { "epoch": 19.54728899806984, "grad_norm": 0.21253272891044617, "learning_rate": 0.001, "loss": 1.9208, "step": 334200 }, { "epoch": 19.55313797742294, "grad_norm": 0.24407510459423065, "learning_rate": 0.001, "loss": 1.9304, "step": 334300 }, { "epoch": 19.558986956776042, "grad_norm": 0.27976712584495544, "learning_rate": 0.001, "loss": 1.9304, "step": 334400 }, { "epoch": 19.564835936129146, "grad_norm": 0.22181713581085205, "learning_rate": 0.001, "loss": 1.9324, "step": 334500 }, { "epoch": 19.57068491548225, "grad_norm": 0.2501920759677887, "learning_rate": 0.001, "loss": 1.9265, "step": 334600 }, { "epoch": 19.57653389483535, "grad_norm": 0.23765112459659576, "learning_rate": 0.001, "loss": 1.9236, "step": 334700 }, { "epoch": 19.582382874188454, "grad_norm": 0.23590117692947388, "learning_rate": 0.001, "loss": 1.9259, "step": 334800 }, { "epoch": 19.588231853541558, "grad_norm": 0.1953977346420288, "learning_rate": 0.001, "loss": 1.9246, "step": 334900 }, { "epoch": 19.59408083289466, "grad_norm": 0.3108517825603485, "learning_rate": 0.001, "loss": 1.9275, "step": 335000 }, { "epoch": 19.599929812247762, "grad_norm": 0.17553678154945374, "learning_rate": 0.001, "loss": 1.9265, "step": 335100 }, { "epoch": 19.605778791600866, "grad_norm": 0.20280154049396515, "learning_rate": 0.001, "loss": 1.9236, "step": 335200 }, { "epoch": 19.61162777095397, "grad_norm": 0.19005273282527924, "learning_rate": 0.001, "loss": 1.9201, "step": 335300 }, { "epoch": 19.61747675030707, "grad_norm": 0.239686518907547, "learning_rate": 0.001, "loss": 1.9295, "step": 335400 }, { "epoch": 19.623325729660174, "grad_norm": 0.2146284431219101, "learning_rate": 0.001, "loss": 1.9236, "step": 335500 }, { "epoch": 19.629174709013277, "grad_norm": 0.2884178161621094, "learning_rate": 0.001, "loss": 1.9361, "step": 335600 }, { "epoch": 19.63502368836638, "grad_norm": 0.2281053066253662, "learning_rate": 0.001, "loss": 1.9291, "step": 335700 }, { "epoch": 19.64087266771948, "grad_norm": 0.22037020325660706, "learning_rate": 0.001, "loss": 1.9285, "step": 335800 }, { "epoch": 19.646721647072585, "grad_norm": 0.260556697845459, "learning_rate": 0.001, "loss": 1.9269, "step": 335900 }, { "epoch": 19.65257062642569, "grad_norm": 0.23550669848918915, "learning_rate": 0.001, "loss": 1.9287, "step": 336000 }, { "epoch": 19.658419605778793, "grad_norm": 0.24850621819496155, "learning_rate": 0.001, "loss": 1.9253, "step": 336100 }, { "epoch": 19.664268585131893, "grad_norm": 0.1930272877216339, "learning_rate": 0.001, "loss": 1.9311, "step": 336200 }, { "epoch": 19.670117564484997, "grad_norm": 0.2545173466205597, "learning_rate": 0.001, "loss": 1.9237, "step": 336300 }, { "epoch": 19.6759665438381, "grad_norm": 0.2601509988307953, "learning_rate": 0.001, "loss": 1.9256, "step": 336400 }, { "epoch": 19.681815523191204, "grad_norm": 0.23469854891300201, "learning_rate": 0.001, "loss": 1.9307, "step": 336500 }, { "epoch": 19.687664502544305, "grad_norm": 0.19980905950069427, "learning_rate": 0.001, "loss": 1.9272, "step": 336600 }, { "epoch": 19.69351348189741, "grad_norm": 0.31911271810531616, "learning_rate": 0.001, "loss": 1.9278, "step": 336700 }, { "epoch": 19.699362461250512, "grad_norm": 0.21089129149913788, "learning_rate": 0.001, "loss": 1.9243, "step": 336800 }, { "epoch": 19.705211440603616, "grad_norm": 0.2984016537666321, "learning_rate": 0.001, "loss": 1.9269, "step": 336900 }, { "epoch": 19.711060419956716, "grad_norm": 0.18024253845214844, "learning_rate": 0.001, "loss": 1.9274, "step": 337000 }, { "epoch": 19.71690939930982, "grad_norm": 0.22807033360004425, "learning_rate": 0.001, "loss": 1.9258, "step": 337100 }, { "epoch": 19.722758378662924, "grad_norm": 0.2068561166524887, "learning_rate": 0.001, "loss": 1.9228, "step": 337200 }, { "epoch": 19.728607358016028, "grad_norm": 0.17873117327690125, "learning_rate": 0.001, "loss": 1.9286, "step": 337300 }, { "epoch": 19.734456337369128, "grad_norm": 0.23234795033931732, "learning_rate": 0.001, "loss": 1.9311, "step": 337400 }, { "epoch": 19.74030531672223, "grad_norm": 0.22365595400333405, "learning_rate": 0.001, "loss": 1.9261, "step": 337500 }, { "epoch": 19.746154296075336, "grad_norm": 0.2619243860244751, "learning_rate": 0.001, "loss": 1.9228, "step": 337600 }, { "epoch": 19.75200327542844, "grad_norm": 0.21176722645759583, "learning_rate": 0.001, "loss": 1.9288, "step": 337700 }, { "epoch": 19.75785225478154, "grad_norm": 0.1738930493593216, "learning_rate": 0.001, "loss": 1.9276, "step": 337800 }, { "epoch": 19.763701234134643, "grad_norm": 0.18990260362625122, "learning_rate": 0.001, "loss": 1.927, "step": 337900 }, { "epoch": 19.769550213487747, "grad_norm": 0.2223321944475174, "learning_rate": 0.001, "loss": 1.9349, "step": 338000 }, { "epoch": 19.77539919284085, "grad_norm": 0.1931186318397522, "learning_rate": 0.001, "loss": 1.9322, "step": 338100 }, { "epoch": 19.78124817219395, "grad_norm": 0.2088804841041565, "learning_rate": 0.001, "loss": 1.9273, "step": 338200 }, { "epoch": 19.787097151547055, "grad_norm": 0.19158071279525757, "learning_rate": 0.001, "loss": 1.9261, "step": 338300 }, { "epoch": 19.79294613090016, "grad_norm": 0.21184737980365753, "learning_rate": 0.001, "loss": 1.9253, "step": 338400 }, { "epoch": 19.79879511025326, "grad_norm": 0.18309658765792847, "learning_rate": 0.001, "loss": 1.9254, "step": 338500 }, { "epoch": 19.804644089606363, "grad_norm": 0.27354374527931213, "learning_rate": 0.001, "loss": 1.9203, "step": 338600 }, { "epoch": 19.810493068959467, "grad_norm": 0.2580430209636688, "learning_rate": 0.001, "loss": 1.9328, "step": 338700 }, { "epoch": 19.81634204831257, "grad_norm": 0.24944333732128143, "learning_rate": 0.001, "loss": 1.9289, "step": 338800 }, { "epoch": 19.82219102766567, "grad_norm": 0.19598227739334106, "learning_rate": 0.001, "loss": 1.9308, "step": 338900 }, { "epoch": 19.828040007018775, "grad_norm": 0.2523568570613861, "learning_rate": 0.001, "loss": 1.9283, "step": 339000 }, { "epoch": 19.83388898637188, "grad_norm": 0.21466299891471863, "learning_rate": 0.001, "loss": 1.9322, "step": 339100 }, { "epoch": 19.839737965724982, "grad_norm": 0.2383754700422287, "learning_rate": 0.001, "loss": 1.9299, "step": 339200 }, { "epoch": 19.845586945078082, "grad_norm": 0.1988820731639862, "learning_rate": 0.001, "loss": 1.9255, "step": 339300 }, { "epoch": 19.851435924431186, "grad_norm": 0.19230179488658905, "learning_rate": 0.001, "loss": 1.9281, "step": 339400 }, { "epoch": 19.85728490378429, "grad_norm": 0.23831453919410706, "learning_rate": 0.001, "loss": 1.9301, "step": 339500 }, { "epoch": 19.863133883137394, "grad_norm": 0.23255565762519836, "learning_rate": 0.001, "loss": 1.9323, "step": 339600 }, { "epoch": 19.868982862490494, "grad_norm": 0.1579200178384781, "learning_rate": 0.001, "loss": 1.9308, "step": 339700 }, { "epoch": 19.874831841843598, "grad_norm": 0.21617574989795685, "learning_rate": 0.001, "loss": 1.927, "step": 339800 }, { "epoch": 19.8806808211967, "grad_norm": 0.21712931990623474, "learning_rate": 0.001, "loss": 1.9281, "step": 339900 }, { "epoch": 19.886529800549805, "grad_norm": 0.21759022772312164, "learning_rate": 0.001, "loss": 1.9277, "step": 340000 }, { "epoch": 19.892378779902906, "grad_norm": 0.2716425061225891, "learning_rate": 0.001, "loss": 1.9305, "step": 340100 }, { "epoch": 19.89822775925601, "grad_norm": 0.1973758190870285, "learning_rate": 0.001, "loss": 1.9358, "step": 340200 }, { "epoch": 19.904076738609113, "grad_norm": 0.2076849788427353, "learning_rate": 0.001, "loss": 1.928, "step": 340300 }, { "epoch": 19.909925717962217, "grad_norm": 0.2182992845773697, "learning_rate": 0.001, "loss": 1.9245, "step": 340400 }, { "epoch": 19.915774697315317, "grad_norm": 0.19879527390003204, "learning_rate": 0.001, "loss": 1.9244, "step": 340500 }, { "epoch": 19.92162367666842, "grad_norm": 0.2111264318227768, "learning_rate": 0.001, "loss": 1.9277, "step": 340600 }, { "epoch": 19.927472656021525, "grad_norm": 0.20919255912303925, "learning_rate": 0.001, "loss": 1.9255, "step": 340700 }, { "epoch": 19.93332163537463, "grad_norm": 0.251354843378067, "learning_rate": 0.001, "loss": 1.9268, "step": 340800 }, { "epoch": 19.93917061472773, "grad_norm": 0.2604437470436096, "learning_rate": 0.001, "loss": 1.9297, "step": 340900 }, { "epoch": 19.945019594080833, "grad_norm": 0.20664525032043457, "learning_rate": 0.001, "loss": 1.927, "step": 341000 }, { "epoch": 19.950868573433937, "grad_norm": 0.2041202187538147, "learning_rate": 0.001, "loss": 1.9259, "step": 341100 }, { "epoch": 19.95671755278704, "grad_norm": 0.19825054705142975, "learning_rate": 0.001, "loss": 1.9282, "step": 341200 }, { "epoch": 19.96256653214014, "grad_norm": 0.2276562601327896, "learning_rate": 0.001, "loss": 1.9285, "step": 341300 }, { "epoch": 19.968415511493244, "grad_norm": 0.23813997209072113, "learning_rate": 0.001, "loss": 1.9271, "step": 341400 }, { "epoch": 19.97426449084635, "grad_norm": 0.2861003279685974, "learning_rate": 0.001, "loss": 1.927, "step": 341500 }, { "epoch": 19.98011347019945, "grad_norm": 0.289075642824173, "learning_rate": 0.001, "loss": 1.9337, "step": 341600 }, { "epoch": 19.985962449552552, "grad_norm": 0.2730565667152405, "learning_rate": 0.001, "loss": 1.9319, "step": 341700 }, { "epoch": 19.991811428905656, "grad_norm": 0.19367703795433044, "learning_rate": 0.001, "loss": 1.9308, "step": 341800 }, { "epoch": 19.99766040825876, "grad_norm": 0.23940560221672058, "learning_rate": 0.001, "loss": 1.9245, "step": 341900 }, { "epoch": 20.00350938761186, "grad_norm": 0.22808925807476044, "learning_rate": 0.001, "loss": 1.9195, "step": 342000 }, { "epoch": 20.009358366964964, "grad_norm": 0.23025310039520264, "learning_rate": 0.001, "loss": 1.9085, "step": 342100 }, { "epoch": 20.015207346318068, "grad_norm": 0.20927144587039948, "learning_rate": 0.001, "loss": 1.911, "step": 342200 }, { "epoch": 20.02105632567117, "grad_norm": 0.23021385073661804, "learning_rate": 0.001, "loss": 1.9106, "step": 342300 }, { "epoch": 20.026905305024272, "grad_norm": 0.24073879420757294, "learning_rate": 0.001, "loss": 1.9082, "step": 342400 }, { "epoch": 20.032754284377376, "grad_norm": 0.2721453905105591, "learning_rate": 0.001, "loss": 1.9121, "step": 342500 }, { "epoch": 20.03860326373048, "grad_norm": 0.22631338238716125, "learning_rate": 0.001, "loss": 1.9089, "step": 342600 }, { "epoch": 20.044452243083583, "grad_norm": 0.20968365669250488, "learning_rate": 0.001, "loss": 1.9137, "step": 342700 }, { "epoch": 20.050301222436683, "grad_norm": 0.2578289210796356, "learning_rate": 0.001, "loss": 1.9128, "step": 342800 }, { "epoch": 20.056150201789787, "grad_norm": 0.2252637892961502, "learning_rate": 0.001, "loss": 1.9081, "step": 342900 }, { "epoch": 20.06199918114289, "grad_norm": 0.2101564109325409, "learning_rate": 0.001, "loss": 1.9168, "step": 343000 }, { "epoch": 20.067848160495995, "grad_norm": 0.1878087818622589, "learning_rate": 0.001, "loss": 1.9093, "step": 343100 }, { "epoch": 20.073697139849095, "grad_norm": 0.23780590295791626, "learning_rate": 0.001, "loss": 1.9112, "step": 343200 }, { "epoch": 20.0795461192022, "grad_norm": 0.16460175812244415, "learning_rate": 0.001, "loss": 1.9148, "step": 343300 }, { "epoch": 20.085395098555303, "grad_norm": 0.23249273002147675, "learning_rate": 0.001, "loss": 1.9143, "step": 343400 }, { "epoch": 20.091244077908406, "grad_norm": 0.16927289962768555, "learning_rate": 0.001, "loss": 1.9067, "step": 343500 }, { "epoch": 20.097093057261507, "grad_norm": 0.31622055172920227, "learning_rate": 0.001, "loss": 1.9161, "step": 343600 }, { "epoch": 20.10294203661461, "grad_norm": 0.22840777039527893, "learning_rate": 0.001, "loss": 1.9122, "step": 343700 }, { "epoch": 20.108791015967714, "grad_norm": 0.25140663981437683, "learning_rate": 0.001, "loss": 1.91, "step": 343800 }, { "epoch": 20.114639995320818, "grad_norm": 0.30096709728240967, "learning_rate": 0.001, "loss": 1.9201, "step": 343900 }, { "epoch": 20.12048897467392, "grad_norm": 0.22653532028198242, "learning_rate": 0.001, "loss": 1.9209, "step": 344000 }, { "epoch": 20.126337954027022, "grad_norm": 0.17663908004760742, "learning_rate": 0.001, "loss": 1.9183, "step": 344100 }, { "epoch": 20.132186933380126, "grad_norm": 0.20906473696231842, "learning_rate": 0.001, "loss": 1.9184, "step": 344200 }, { "epoch": 20.13803591273323, "grad_norm": 0.22733084857463837, "learning_rate": 0.001, "loss": 1.9133, "step": 344300 }, { "epoch": 20.14388489208633, "grad_norm": 0.23026421666145325, "learning_rate": 0.001, "loss": 1.9219, "step": 344400 }, { "epoch": 20.149733871439434, "grad_norm": 0.22536955773830414, "learning_rate": 0.001, "loss": 1.9157, "step": 344500 }, { "epoch": 20.155582850792538, "grad_norm": 0.22636069357395172, "learning_rate": 0.001, "loss": 1.9148, "step": 344600 }, { "epoch": 20.161431830145638, "grad_norm": 0.18618625402450562, "learning_rate": 0.001, "loss": 1.9173, "step": 344700 }, { "epoch": 20.16728080949874, "grad_norm": 0.2300073504447937, "learning_rate": 0.001, "loss": 1.9227, "step": 344800 }, { "epoch": 20.173129788851845, "grad_norm": 0.24294574558734894, "learning_rate": 0.001, "loss": 1.9122, "step": 344900 }, { "epoch": 20.17897876820495, "grad_norm": 0.27121761441230774, "learning_rate": 0.001, "loss": 1.9159, "step": 345000 }, { "epoch": 20.18482774755805, "grad_norm": 0.2513693869113922, "learning_rate": 0.001, "loss": 1.9128, "step": 345100 }, { "epoch": 20.190676726911153, "grad_norm": 0.2211308628320694, "learning_rate": 0.001, "loss": 1.9139, "step": 345200 }, { "epoch": 20.196525706264257, "grad_norm": 0.2552067041397095, "learning_rate": 0.001, "loss": 1.9193, "step": 345300 }, { "epoch": 20.20237468561736, "grad_norm": 0.1993148922920227, "learning_rate": 0.001, "loss": 1.9154, "step": 345400 }, { "epoch": 20.20822366497046, "grad_norm": 0.21850456297397614, "learning_rate": 0.001, "loss": 1.9125, "step": 345500 }, { "epoch": 20.214072644323565, "grad_norm": 0.21106696128845215, "learning_rate": 0.001, "loss": 1.9169, "step": 345600 }, { "epoch": 20.21992162367667, "grad_norm": 0.2612873911857605, "learning_rate": 0.001, "loss": 1.9212, "step": 345700 }, { "epoch": 20.225770603029773, "grad_norm": 0.2533285319805145, "learning_rate": 0.001, "loss": 1.9179, "step": 345800 }, { "epoch": 20.231619582382873, "grad_norm": 0.22167378664016724, "learning_rate": 0.001, "loss": 1.9259, "step": 345900 }, { "epoch": 20.237468561735977, "grad_norm": 0.25230294466018677, "learning_rate": 0.001, "loss": 1.9201, "step": 346000 }, { "epoch": 20.24331754108908, "grad_norm": 0.18510569632053375, "learning_rate": 0.001, "loss": 1.9238, "step": 346100 }, { "epoch": 20.249166520442184, "grad_norm": 0.24107272922992706, "learning_rate": 0.001, "loss": 1.909, "step": 346200 }, { "epoch": 20.255015499795284, "grad_norm": 0.3788105249404907, "learning_rate": 0.001, "loss": 1.9178, "step": 346300 }, { "epoch": 20.26086447914839, "grad_norm": 0.25827595591545105, "learning_rate": 0.001, "loss": 1.919, "step": 346400 }, { "epoch": 20.266713458501492, "grad_norm": 0.3389303386211395, "learning_rate": 0.001, "loss": 1.9143, "step": 346500 }, { "epoch": 20.272562437854596, "grad_norm": 0.3137296736240387, "learning_rate": 0.001, "loss": 1.9175, "step": 346600 }, { "epoch": 20.278411417207696, "grad_norm": 0.2601909339427948, "learning_rate": 0.001, "loss": 1.9173, "step": 346700 }, { "epoch": 20.2842603965608, "grad_norm": 0.2387191653251648, "learning_rate": 0.001, "loss": 1.9179, "step": 346800 }, { "epoch": 20.290109375913904, "grad_norm": 0.19245432317256927, "learning_rate": 0.001, "loss": 1.9263, "step": 346900 }, { "epoch": 20.295958355267008, "grad_norm": 0.23094753921031952, "learning_rate": 0.001, "loss": 1.9107, "step": 347000 }, { "epoch": 20.301807334620108, "grad_norm": 0.25302761793136597, "learning_rate": 0.001, "loss": 1.9173, "step": 347100 }, { "epoch": 20.30765631397321, "grad_norm": 0.19899901747703552, "learning_rate": 0.001, "loss": 1.9208, "step": 347200 }, { "epoch": 20.313505293326315, "grad_norm": 0.22500891983509064, "learning_rate": 0.001, "loss": 1.9129, "step": 347300 }, { "epoch": 20.31935427267942, "grad_norm": 0.26669254899024963, "learning_rate": 0.001, "loss": 1.9172, "step": 347400 }, { "epoch": 20.32520325203252, "grad_norm": 0.2760084271430969, "learning_rate": 0.001, "loss": 1.9221, "step": 347500 }, { "epoch": 20.331052231385623, "grad_norm": 0.25278589129447937, "learning_rate": 0.001, "loss": 1.9181, "step": 347600 }, { "epoch": 20.336901210738727, "grad_norm": 0.21337521076202393, "learning_rate": 0.001, "loss": 1.9179, "step": 347700 }, { "epoch": 20.342750190091827, "grad_norm": 0.2982942759990692, "learning_rate": 0.001, "loss": 1.9189, "step": 347800 }, { "epoch": 20.34859916944493, "grad_norm": 0.22592109441757202, "learning_rate": 0.001, "loss": 1.9209, "step": 347900 }, { "epoch": 20.354448148798035, "grad_norm": 0.1879177838563919, "learning_rate": 0.001, "loss": 1.9214, "step": 348000 }, { "epoch": 20.36029712815114, "grad_norm": 0.21947066485881805, "learning_rate": 0.001, "loss": 1.9194, "step": 348100 }, { "epoch": 20.36614610750424, "grad_norm": 0.24339383840560913, "learning_rate": 0.001, "loss": 1.9235, "step": 348200 }, { "epoch": 20.371995086857343, "grad_norm": 0.2480156421661377, "learning_rate": 0.001, "loss": 1.9268, "step": 348300 }, { "epoch": 20.377844066210447, "grad_norm": 0.2675359845161438, "learning_rate": 0.001, "loss": 1.9251, "step": 348400 }, { "epoch": 20.38369304556355, "grad_norm": 0.2513958215713501, "learning_rate": 0.001, "loss": 1.9196, "step": 348500 }, { "epoch": 20.38954202491665, "grad_norm": 0.23525914549827576, "learning_rate": 0.001, "loss": 1.918, "step": 348600 }, { "epoch": 20.395391004269754, "grad_norm": 0.20655667781829834, "learning_rate": 0.001, "loss": 1.9137, "step": 348700 }, { "epoch": 20.401239983622858, "grad_norm": 0.19758914411067963, "learning_rate": 0.001, "loss": 1.9212, "step": 348800 }, { "epoch": 20.407088962975962, "grad_norm": 0.24538826942443848, "learning_rate": 0.001, "loss": 1.9172, "step": 348900 }, { "epoch": 20.412937942329062, "grad_norm": 0.22834700345993042, "learning_rate": 0.001, "loss": 1.9212, "step": 349000 }, { "epoch": 20.418786921682166, "grad_norm": 0.2570667564868927, "learning_rate": 0.001, "loss": 1.9205, "step": 349100 }, { "epoch": 20.42463590103527, "grad_norm": 0.24109676480293274, "learning_rate": 0.001, "loss": 1.9235, "step": 349200 }, { "epoch": 20.430484880388374, "grad_norm": 0.23217622935771942, "learning_rate": 0.001, "loss": 1.9227, "step": 349300 }, { "epoch": 20.436333859741474, "grad_norm": 0.2779260575771332, "learning_rate": 0.001, "loss": 1.9154, "step": 349400 }, { "epoch": 20.442182839094578, "grad_norm": 0.2725665271282196, "learning_rate": 0.001, "loss": 1.9226, "step": 349500 }, { "epoch": 20.44803181844768, "grad_norm": 0.21813629567623138, "learning_rate": 0.001, "loss": 1.917, "step": 349600 }, { "epoch": 20.453880797800785, "grad_norm": 0.2411060482263565, "learning_rate": 0.001, "loss": 1.9203, "step": 349700 }, { "epoch": 20.459729777153886, "grad_norm": 0.2226746678352356, "learning_rate": 0.001, "loss": 1.9171, "step": 349800 }, { "epoch": 20.46557875650699, "grad_norm": 0.2652744650840759, "learning_rate": 0.001, "loss": 1.9191, "step": 349900 }, { "epoch": 20.471427735860093, "grad_norm": 0.2252127230167389, "learning_rate": 0.001, "loss": 1.92, "step": 350000 }, { "epoch": 20.471427735860093, "eval_ag_news_accuracy": 0.238296875, "eval_ag_news_bleu_score": 7.2236216103781405, "eval_ag_news_bleu_score_sem": 0.43649729112121066, "eval_ag_news_emb_cos_sim": 0.7053764462471008, "eval_ag_news_emb_cos_sim_sem": 0.014971710741519928, "eval_ag_news_emb_top1_equal": 0.9296875, "eval_ag_news_emb_top1_equal_sem": 0.022687306627631187, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.746917486190796, "eval_ag_news_n_ngrams_match_1": 14.953125, "eval_ag_news_n_ngrams_match_2": 4.3984375, "eval_ag_news_n_ngrams_match_3": 1.59375, "eval_ag_news_num_pred_words": 48.8984375, "eval_ag_news_num_true_words": 46.3359375, "eval_ag_news_perplexity": 15.594487496384874, "eval_ag_news_pred_num_tokens": 71.8125, "eval_ag_news_rouge_score": 0.29813324219241255, "eval_ag_news_runtime": 37.6147, "eval_ag_news_samples_per_second": 13.293, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.34039954770530373, "eval_ag_news_token_set_f1_sem": 0.010001350274950251, "eval_ag_news_token_set_precision": 0.32401762584060845, "eval_ag_news_token_set_recall": 0.36910395452558886, "eval_ag_news_true_num_tokens": 64.171875, "step": 350000 }, { "epoch": 20.471427735860093, "eval_anthropic_toxic_prompts_accuracy": 0.1015, "eval_anthropic_toxic_prompts_bleu_score": 38.177454418461764, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.371092451528668, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8761566281318665, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010570215992629528, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.0859375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02487009494926453, "eval_anthropic_toxic_prompts_loss": 1.2634072303771973, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.3828125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 4.984375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.140625, "eval_anthropic_toxic_prompts_num_pred_words": 13.3515625, "eval_anthropic_toxic_prompts_num_true_words": 13.2109375, "eval_anthropic_toxic_prompts_perplexity": 3.5374538975056566, "eval_anthropic_toxic_prompts_pred_num_tokens": 17.3671875, "eval_anthropic_toxic_prompts_rouge_score": 0.668637834468663, "eval_anthropic_toxic_prompts_runtime": 29.6648, "eval_anthropic_toxic_prompts_samples_per_second": 16.855, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6858193305792751, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018282616022972403, "eval_anthropic_toxic_prompts_token_set_precision": 0.6919693287419776, "eval_anthropic_toxic_prompts_token_set_recall": 0.6849284783122609, "eval_anthropic_toxic_prompts_true_num_tokens": 16.3671875, "step": 350000 }, { "epoch": 20.471427735860093, "eval_arxiv_accuracy": 0.374765625, "eval_arxiv_bleu_score": 1.3930208199285152, "eval_arxiv_bleu_score_sem": 0.1483522524205727, "eval_arxiv_emb_cos_sim": 0.3800150156021118, "eval_arxiv_emb_cos_sim_sem": 0.016112904995679855, "eval_arxiv_emb_top1_equal": 0.8828125, "eval_arxiv_emb_top1_equal_sem": 0.02854125387966633, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.457918882369995, "eval_arxiv_n_ngrams_match_1": 11.4765625, "eval_arxiv_n_ngrams_match_2": 1.6953125, "eval_arxiv_n_ngrams_match_3": 0.2734375, "eval_arxiv_num_pred_words": 51.5, "eval_arxiv_num_true_words": 84.8671875, "eval_arxiv_perplexity": 31.750830496605214, "eval_arxiv_pred_num_tokens": 126.3125, "eval_arxiv_rouge_score": 0.15964770705389142, "eval_arxiv_runtime": 30.8459, "eval_arxiv_samples_per_second": 16.21, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.15898658313879893, "eval_arxiv_token_set_f1_sem": 0.008222406345441366, "eval_arxiv_token_set_precision": 0.10581301592876438, "eval_arxiv_token_set_recall": 0.41682998268232285, "eval_arxiv_true_num_tokens": 124.0546875, "step": 350000 }, { "epoch": 20.471427735860093, "eval_python_code_alpaca_accuracy": 0.1308125, "eval_python_code_alpaca_bleu_score": 28.026086232830597, "eval_python_code_alpaca_bleu_score_sem": 1.6749694354338858, "eval_python_code_alpaca_emb_cos_sim": 0.8731042742729187, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009973104111850262, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.5169028043746948, "eval_python_code_alpaca_n_ngrams_match_1": 10.25, "eval_python_code_alpaca_n_ngrams_match_2": 5.7109375, "eval_python_code_alpaca_n_ngrams_match_3": 3.2734375, "eval_python_code_alpaca_num_pred_words": 16.875, "eval_python_code_alpaca_num_true_words": 17.8046875, "eval_python_code_alpaca_perplexity": 4.55808602645819, "eval_python_code_alpaca_pred_num_tokens": 22.9296875, "eval_python_code_alpaca_rouge_score": 0.6058963051050674, "eval_python_code_alpaca_runtime": 29.4964, "eval_python_code_alpaca_samples_per_second": 16.951, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6149648060584465, "eval_python_code_alpaca_token_set_f1_sem": 0.0132206800805239, "eval_python_code_alpaca_token_set_precision": 0.6032622412392028, "eval_python_code_alpaca_token_set_recall": 0.6330825387321589, "eval_python_code_alpaca_true_num_tokens": 23.1171875, "step": 350000 }, { "epoch": 20.471427735860093, "eval_wikibio_accuracy": 0.365, "eval_wikibio_bleu_score": 6.63060637248903, "eval_wikibio_bleu_score_sem": 0.6270844813508349, "eval_wikibio_emb_cos_sim": 0.5457330942153931, "eval_wikibio_emb_cos_sim_sem": 0.02316853404045105, "eval_wikibio_emb_top1_equal": 0.8515625, "eval_wikibio_emb_top1_equal_sem": 0.03154846653342247, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.707827568054199, "eval_wikibio_n_ngrams_match_1": 14.765625, "eval_wikibio_n_ngrams_match_2": 4.765625, "eval_wikibio_n_ngrams_match_3": 1.875, "eval_wikibio_num_pred_words": 54.2734375, "eval_wikibio_num_true_words": 52.5234375, "eval_wikibio_perplexity": 14.996660875993307, "eval_wikibio_pred_num_tokens": 109.609375, "eval_wikibio_rouge_score": 0.2783223270086963, "eval_wikibio_runtime": 31.1111, "eval_wikibio_samples_per_second": 16.071, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.313727737006609, "eval_wikibio_token_set_f1_sem": 0.013091739900533283, "eval_wikibio_token_set_precision": 0.2757137299033257, "eval_wikibio_token_set_recall": 0.40886927275857776, "eval_wikibio_true_num_tokens": 100.734375, "step": 350000 }, { "epoch": 20.471427735860093, "eval_msmarco_accuracy": 0.384328125, "eval_msmarco_bleu_score": 16.9166970172107, "eval_msmarco_bleu_score_sem": 1.3371934788070876, "eval_msmarco_emb_cos_sim": 0.7485939264297485, "eval_msmarco_emb_cos_sim_sem": 0.019328752532601357, "eval_msmarco_emb_top1_equal": 0.921875, "eval_msmarco_emb_top1_equal_sem": 0.023813825100660324, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8054684400558472, "eval_msmarco_n_ngrams_match_1": 27.5078125, "eval_msmarco_n_ngrams_match_2": 12.3828125, "eval_msmarco_n_ngrams_match_3": 6.875, "eval_msmarco_num_pred_words": 61.5234375, "eval_msmarco_num_true_words": 59.4453125, "eval_msmarco_perplexity": 6.082820217871976, "eval_msmarco_pred_num_tokens": 83.265625, "eval_msmarco_rouge_score": 0.4433124608549184, "eval_msmarco_runtime": 25.068, "eval_msmarco_samples_per_second": 19.946, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.4755053700035903, "eval_msmarco_token_set_f1_sem": 0.014212557262561125, "eval_msmarco_token_set_precision": 0.4407401926766514, "eval_msmarco_token_set_recall": 0.5335812569575825, "eval_msmarco_true_num_tokens": 76.7734375, "step": 350000 }, { "epoch": 20.477276715213197, "grad_norm": 0.2135695368051529, "learning_rate": 0.001, "loss": 1.9198, "step": 350100 }, { "epoch": 20.483125694566297, "grad_norm": 0.2137782871723175, "learning_rate": 0.001, "loss": 1.9226, "step": 350200 }, { "epoch": 20.4889746739194, "grad_norm": 0.23824729025363922, "learning_rate": 0.001, "loss": 1.9197, "step": 350300 }, { "epoch": 20.494823653272505, "grad_norm": 0.2220480740070343, "learning_rate": 0.001, "loss": 1.9258, "step": 350400 }, { "epoch": 20.50067263262561, "grad_norm": 0.2546806335449219, "learning_rate": 0.001, "loss": 1.9227, "step": 350500 }, { "epoch": 20.50652161197871, "grad_norm": 0.3058742880821228, "learning_rate": 0.001, "loss": 1.9211, "step": 350600 }, { "epoch": 20.512370591331813, "grad_norm": 0.3469679355621338, "learning_rate": 0.001, "loss": 1.9247, "step": 350700 }, { "epoch": 20.518219570684916, "grad_norm": 0.21234732866287231, "learning_rate": 0.001, "loss": 1.9144, "step": 350800 }, { "epoch": 20.524068550038017, "grad_norm": 0.26769623160362244, "learning_rate": 0.001, "loss": 1.9199, "step": 350900 }, { "epoch": 20.52991752939112, "grad_norm": 0.18405944108963013, "learning_rate": 0.001, "loss": 1.9223, "step": 351000 }, { "epoch": 20.535766508744224, "grad_norm": 0.17586560547351837, "learning_rate": 0.001, "loss": 1.9219, "step": 351100 }, { "epoch": 20.541615488097328, "grad_norm": 0.24097074568271637, "learning_rate": 0.001, "loss": 1.9187, "step": 351200 }, { "epoch": 20.54746446745043, "grad_norm": 0.21067045629024506, "learning_rate": 0.001, "loss": 1.9138, "step": 351300 }, { "epoch": 20.553313446803532, "grad_norm": 0.23664098978042603, "learning_rate": 0.001, "loss": 1.9235, "step": 351400 }, { "epoch": 20.559162426156636, "grad_norm": 0.19946105778217316, "learning_rate": 0.001, "loss": 1.9248, "step": 351500 }, { "epoch": 20.56501140550974, "grad_norm": 0.2291213423013687, "learning_rate": 0.001, "loss": 1.9187, "step": 351600 }, { "epoch": 20.57086038486284, "grad_norm": 0.3009444773197174, "learning_rate": 0.001, "loss": 1.9238, "step": 351700 }, { "epoch": 20.576709364215944, "grad_norm": 0.2238491326570511, "learning_rate": 0.001, "loss": 1.9213, "step": 351800 }, { "epoch": 20.582558343569048, "grad_norm": 0.28537002205848694, "learning_rate": 0.001, "loss": 1.9188, "step": 351900 }, { "epoch": 20.58840732292215, "grad_norm": 0.2382446825504303, "learning_rate": 0.001, "loss": 1.9208, "step": 352000 }, { "epoch": 20.59425630227525, "grad_norm": 0.1812097579240799, "learning_rate": 0.001, "loss": 1.9255, "step": 352100 }, { "epoch": 20.600105281628355, "grad_norm": 0.22001396119594574, "learning_rate": 0.001, "loss": 1.9184, "step": 352200 }, { "epoch": 20.60595426098146, "grad_norm": 0.17225025594234467, "learning_rate": 0.001, "loss": 1.9148, "step": 352300 }, { "epoch": 20.611803240334563, "grad_norm": 0.24468407034873962, "learning_rate": 0.001, "loss": 1.9241, "step": 352400 }, { "epoch": 20.617652219687663, "grad_norm": 0.2859067916870117, "learning_rate": 0.001, "loss": 1.919, "step": 352500 }, { "epoch": 20.623501199040767, "grad_norm": 0.2969032824039459, "learning_rate": 0.001, "loss": 1.9197, "step": 352600 }, { "epoch": 20.62935017839387, "grad_norm": 0.21614857017993927, "learning_rate": 0.001, "loss": 1.926, "step": 352700 }, { "epoch": 20.635199157746975, "grad_norm": 0.22476233541965485, "learning_rate": 0.001, "loss": 1.9222, "step": 352800 }, { "epoch": 20.641048137100075, "grad_norm": 0.18674376606941223, "learning_rate": 0.001, "loss": 1.9188, "step": 352900 }, { "epoch": 20.64689711645318, "grad_norm": 0.2268756628036499, "learning_rate": 0.001, "loss": 1.9249, "step": 353000 }, { "epoch": 20.652746095806283, "grad_norm": 0.22174674272537231, "learning_rate": 0.001, "loss": 1.9184, "step": 353100 }, { "epoch": 20.658595075159386, "grad_norm": 0.2544998228549957, "learning_rate": 0.001, "loss": 1.9178, "step": 353200 }, { "epoch": 20.664444054512487, "grad_norm": 0.2739373743534088, "learning_rate": 0.001, "loss": 1.9221, "step": 353300 }, { "epoch": 20.67029303386559, "grad_norm": 0.1908223181962967, "learning_rate": 0.001, "loss": 1.921, "step": 353400 }, { "epoch": 20.676142013218694, "grad_norm": 0.23233836889266968, "learning_rate": 0.001, "loss": 1.9113, "step": 353500 }, { "epoch": 20.681990992571798, "grad_norm": 0.24064555764198303, "learning_rate": 0.001, "loss": 1.9191, "step": 353600 }, { "epoch": 20.687839971924898, "grad_norm": 0.25279924273490906, "learning_rate": 0.001, "loss": 1.9218, "step": 353700 }, { "epoch": 20.693688951278002, "grad_norm": 0.18459567427635193, "learning_rate": 0.001, "loss": 1.9178, "step": 353800 }, { "epoch": 20.699537930631106, "grad_norm": 0.21922023594379425, "learning_rate": 0.001, "loss": 1.9163, "step": 353900 }, { "epoch": 20.705386909984206, "grad_norm": 0.2298808991909027, "learning_rate": 0.001, "loss": 1.9218, "step": 354000 }, { "epoch": 20.71123588933731, "grad_norm": 0.22195890545845032, "learning_rate": 0.001, "loss": 1.9138, "step": 354100 }, { "epoch": 20.717084868690414, "grad_norm": 0.19816820323467255, "learning_rate": 0.001, "loss": 1.9163, "step": 354200 }, { "epoch": 20.722933848043517, "grad_norm": 0.3129444122314453, "learning_rate": 0.001, "loss": 1.9185, "step": 354300 }, { "epoch": 20.728782827396618, "grad_norm": 0.23578982055187225, "learning_rate": 0.001, "loss": 1.9169, "step": 354400 }, { "epoch": 20.73463180674972, "grad_norm": 0.18305252492427826, "learning_rate": 0.001, "loss": 1.9145, "step": 354500 }, { "epoch": 20.740480786102825, "grad_norm": 0.20788919925689697, "learning_rate": 0.001, "loss": 1.9223, "step": 354600 }, { "epoch": 20.74632976545593, "grad_norm": 0.26482149958610535, "learning_rate": 0.001, "loss": 1.9217, "step": 354700 }, { "epoch": 20.75217874480903, "grad_norm": 0.24310307204723358, "learning_rate": 0.001, "loss": 1.9257, "step": 354800 }, { "epoch": 20.758027724162133, "grad_norm": 0.30752110481262207, "learning_rate": 0.001, "loss": 1.9196, "step": 354900 }, { "epoch": 20.763876703515237, "grad_norm": 0.20390312373638153, "learning_rate": 0.001, "loss": 1.9171, "step": 355000 }, { "epoch": 20.76972568286834, "grad_norm": 0.21201875805854797, "learning_rate": 0.001, "loss": 1.9202, "step": 355100 }, { "epoch": 20.77557466222144, "grad_norm": 0.26384076476097107, "learning_rate": 0.001, "loss": 1.9299, "step": 355200 }, { "epoch": 20.781423641574545, "grad_norm": 0.19244423508644104, "learning_rate": 0.001, "loss": 1.918, "step": 355300 }, { "epoch": 20.78727262092765, "grad_norm": 0.2591624855995178, "learning_rate": 0.001, "loss": 1.917, "step": 355400 }, { "epoch": 20.793121600280752, "grad_norm": 0.23300950229167938, "learning_rate": 0.001, "loss": 1.9197, "step": 355500 }, { "epoch": 20.798970579633853, "grad_norm": 0.20480464398860931, "learning_rate": 0.001, "loss": 1.9211, "step": 355600 }, { "epoch": 20.804819558986956, "grad_norm": 0.23944002389907837, "learning_rate": 0.001, "loss": 1.9249, "step": 355700 }, { "epoch": 20.81066853834006, "grad_norm": 0.26141253113746643, "learning_rate": 0.001, "loss": 1.9174, "step": 355800 }, { "epoch": 20.816517517693164, "grad_norm": 0.3048345446586609, "learning_rate": 0.001, "loss": 1.9248, "step": 355900 }, { "epoch": 20.822366497046264, "grad_norm": 0.2041730433702469, "learning_rate": 0.001, "loss": 1.921, "step": 356000 }, { "epoch": 20.828215476399368, "grad_norm": 0.1529606580734253, "learning_rate": 0.001, "loss": 1.9206, "step": 356100 }, { "epoch": 20.834064455752472, "grad_norm": 0.2207241803407669, "learning_rate": 0.001, "loss": 1.921, "step": 356200 }, { "epoch": 20.839913435105576, "grad_norm": 0.17658869922161102, "learning_rate": 0.001, "loss": 1.9242, "step": 356300 }, { "epoch": 20.845762414458676, "grad_norm": 0.2310008406639099, "learning_rate": 0.001, "loss": 1.921, "step": 356400 }, { "epoch": 20.85161139381178, "grad_norm": 0.20571531355381012, "learning_rate": 0.001, "loss": 1.9201, "step": 356500 }, { "epoch": 20.857460373164884, "grad_norm": 0.22995467483997345, "learning_rate": 0.001, "loss": 1.9218, "step": 356600 }, { "epoch": 20.863309352517987, "grad_norm": 0.20817555487155914, "learning_rate": 0.001, "loss": 1.9243, "step": 356700 }, { "epoch": 20.869158331871088, "grad_norm": 0.18431487679481506, "learning_rate": 0.001, "loss": 1.9177, "step": 356800 }, { "epoch": 20.87500731122419, "grad_norm": 0.24058720469474792, "learning_rate": 0.001, "loss": 1.9233, "step": 356900 }, { "epoch": 20.880856290577295, "grad_norm": 0.2698889970779419, "learning_rate": 0.001, "loss": 1.9206, "step": 357000 }, { "epoch": 20.886705269930395, "grad_norm": 0.215304434299469, "learning_rate": 0.001, "loss": 1.9239, "step": 357100 }, { "epoch": 20.8925542492835, "grad_norm": 0.2959764301776886, "learning_rate": 0.001, "loss": 1.9164, "step": 357200 }, { "epoch": 20.898403228636603, "grad_norm": 0.23445619642734528, "learning_rate": 0.001, "loss": 1.9174, "step": 357300 }, { "epoch": 20.904252207989707, "grad_norm": 0.2755839228630066, "learning_rate": 0.001, "loss": 1.9181, "step": 357400 }, { "epoch": 20.910101187342807, "grad_norm": 0.20591403543949127, "learning_rate": 0.001, "loss": 1.9197, "step": 357500 }, { "epoch": 20.91595016669591, "grad_norm": 0.23050498962402344, "learning_rate": 0.001, "loss": 1.9268, "step": 357600 }, { "epoch": 20.921799146049015, "grad_norm": 0.22775016725063324, "learning_rate": 0.001, "loss": 1.922, "step": 357700 }, { "epoch": 20.92764812540212, "grad_norm": 0.2526380121707916, "learning_rate": 0.001, "loss": 1.9184, "step": 357800 }, { "epoch": 20.93349710475522, "grad_norm": 0.24097010493278503, "learning_rate": 0.001, "loss": 1.9192, "step": 357900 }, { "epoch": 20.939346084108323, "grad_norm": 0.2000662088394165, "learning_rate": 0.001, "loss": 1.9255, "step": 358000 }, { "epoch": 20.945195063461426, "grad_norm": 0.2590339779853821, "learning_rate": 0.001, "loss": 1.9181, "step": 358100 }, { "epoch": 20.95104404281453, "grad_norm": 0.18566519021987915, "learning_rate": 0.001, "loss": 1.9196, "step": 358200 }, { "epoch": 20.95689302216763, "grad_norm": 0.23365838825702667, "learning_rate": 0.001, "loss": 1.9246, "step": 358300 }, { "epoch": 20.962742001520734, "grad_norm": 0.2632681131362915, "learning_rate": 0.001, "loss": 1.9198, "step": 358400 }, { "epoch": 20.968590980873838, "grad_norm": 0.2211712896823883, "learning_rate": 0.001, "loss": 1.9256, "step": 358500 }, { "epoch": 20.974439960226942, "grad_norm": 0.2135019451379776, "learning_rate": 0.001, "loss": 1.9216, "step": 358600 }, { "epoch": 20.980288939580042, "grad_norm": 0.1919899582862854, "learning_rate": 0.001, "loss": 1.9239, "step": 358700 }, { "epoch": 20.986137918933146, "grad_norm": 0.23452189564704895, "learning_rate": 0.001, "loss": 1.9155, "step": 358800 }, { "epoch": 20.99198689828625, "grad_norm": 0.2531946897506714, "learning_rate": 0.001, "loss": 1.9217, "step": 358900 }, { "epoch": 20.997835877639353, "grad_norm": 0.19044344127178192, "learning_rate": 0.001, "loss": 1.9215, "step": 359000 }, { "epoch": 21.003684856992454, "grad_norm": 0.26155680418014526, "learning_rate": 0.001, "loss": 1.9057, "step": 359100 }, { "epoch": 21.009533836345557, "grad_norm": 0.21960175037384033, "learning_rate": 0.001, "loss": 1.9058, "step": 359200 }, { "epoch": 21.01538281569866, "grad_norm": 0.2487984299659729, "learning_rate": 0.001, "loss": 1.905, "step": 359300 }, { "epoch": 21.021231795051765, "grad_norm": 0.21654503047466278, "learning_rate": 0.001, "loss": 1.9088, "step": 359400 }, { "epoch": 21.027080774404865, "grad_norm": 0.3136421740055084, "learning_rate": 0.001, "loss": 1.9054, "step": 359500 }, { "epoch": 21.03292975375797, "grad_norm": 0.2559809386730194, "learning_rate": 0.001, "loss": 1.9065, "step": 359600 }, { "epoch": 21.038778733111073, "grad_norm": 0.23410865664482117, "learning_rate": 0.001, "loss": 1.9081, "step": 359700 }, { "epoch": 21.044627712464177, "grad_norm": 0.2305229753255844, "learning_rate": 0.001, "loss": 1.9038, "step": 359800 }, { "epoch": 21.050476691817277, "grad_norm": 0.2000458687543869, "learning_rate": 0.001, "loss": 1.9056, "step": 359900 }, { "epoch": 21.05632567117038, "grad_norm": 0.2212233990430832, "learning_rate": 0.001, "loss": 1.9104, "step": 360000 }, { "epoch": 21.062174650523485, "grad_norm": 0.25844910740852356, "learning_rate": 0.001, "loss": 1.9043, "step": 360100 }, { "epoch": 21.068023629876585, "grad_norm": 0.23221926391124725, "learning_rate": 0.001, "loss": 1.9014, "step": 360200 }, { "epoch": 21.07387260922969, "grad_norm": 0.25472596287727356, "learning_rate": 0.001, "loss": 1.9027, "step": 360300 }, { "epoch": 21.079721588582792, "grad_norm": 0.22692665457725525, "learning_rate": 0.001, "loss": 1.9098, "step": 360400 }, { "epoch": 21.085570567935896, "grad_norm": 0.21806754171848297, "learning_rate": 0.001, "loss": 1.9047, "step": 360500 }, { "epoch": 21.091419547288996, "grad_norm": 0.2686242461204529, "learning_rate": 0.001, "loss": 1.9089, "step": 360600 }, { "epoch": 21.0972685266421, "grad_norm": 0.2893035113811493, "learning_rate": 0.001, "loss": 1.9212, "step": 360700 }, { "epoch": 21.103117505995204, "grad_norm": 0.33384692668914795, "learning_rate": 0.001, "loss": 1.9095, "step": 360800 }, { "epoch": 21.108966485348308, "grad_norm": 0.2848469614982605, "learning_rate": 0.001, "loss": 1.9136, "step": 360900 }, { "epoch": 21.114815464701408, "grad_norm": 0.2618001103401184, "learning_rate": 0.001, "loss": 1.9067, "step": 361000 }, { "epoch": 21.120664444054512, "grad_norm": 0.3782064914703369, "learning_rate": 0.001, "loss": 1.9135, "step": 361100 }, { "epoch": 21.126513423407616, "grad_norm": 0.19964565336704254, "learning_rate": 0.001, "loss": 1.9131, "step": 361200 }, { "epoch": 21.13236240276072, "grad_norm": 0.23107247054576874, "learning_rate": 0.001, "loss": 1.9131, "step": 361300 }, { "epoch": 21.13821138211382, "grad_norm": 0.3498475253582001, "learning_rate": 0.001, "loss": 1.9114, "step": 361400 }, { "epoch": 21.144060361466924, "grad_norm": 0.26903435587882996, "learning_rate": 0.001, "loss": 1.9053, "step": 361500 }, { "epoch": 21.149909340820027, "grad_norm": 0.20976705849170685, "learning_rate": 0.001, "loss": 1.9049, "step": 361600 }, { "epoch": 21.15575832017313, "grad_norm": 0.24200186133384705, "learning_rate": 0.001, "loss": 1.9093, "step": 361700 }, { "epoch": 21.16160729952623, "grad_norm": 0.21032069623470306, "learning_rate": 0.001, "loss": 1.9107, "step": 361800 }, { "epoch": 21.167456278879335, "grad_norm": 0.34285396337509155, "learning_rate": 0.001, "loss": 1.9078, "step": 361900 }, { "epoch": 21.17330525823244, "grad_norm": 0.22095482051372528, "learning_rate": 0.001, "loss": 1.9128, "step": 362000 }, { "epoch": 21.179154237585543, "grad_norm": 0.20171724259853363, "learning_rate": 0.001, "loss": 1.9143, "step": 362100 }, { "epoch": 21.185003216938643, "grad_norm": 0.26496508717536926, "learning_rate": 0.001, "loss": 1.9134, "step": 362200 }, { "epoch": 21.190852196291747, "grad_norm": 0.21281792223453522, "learning_rate": 0.001, "loss": 1.9084, "step": 362300 }, { "epoch": 21.19670117564485, "grad_norm": 0.19303390383720398, "learning_rate": 0.001, "loss": 1.9041, "step": 362400 }, { "epoch": 21.202550154997954, "grad_norm": 0.2911781072616577, "learning_rate": 0.001, "loss": 1.9125, "step": 362500 }, { "epoch": 21.208399134351055, "grad_norm": 0.23554447293281555, "learning_rate": 0.001, "loss": 1.9114, "step": 362600 }, { "epoch": 21.21424811370416, "grad_norm": 0.24876701831817627, "learning_rate": 0.001, "loss": 1.9111, "step": 362700 }, { "epoch": 21.220097093057262, "grad_norm": 0.26351824402809143, "learning_rate": 0.001, "loss": 1.9099, "step": 362800 }, { "epoch": 21.225946072410366, "grad_norm": 0.2773571312427521, "learning_rate": 0.001, "loss": 1.9099, "step": 362900 }, { "epoch": 21.231795051763466, "grad_norm": 0.19032622873783112, "learning_rate": 0.001, "loss": 1.9107, "step": 363000 }, { "epoch": 21.23764403111657, "grad_norm": 0.32044610381126404, "learning_rate": 0.001, "loss": 1.9141, "step": 363100 }, { "epoch": 21.243493010469674, "grad_norm": 0.28319448232650757, "learning_rate": 0.001, "loss": 1.9106, "step": 363200 }, { "epoch": 21.249341989822774, "grad_norm": 0.28709882497787476, "learning_rate": 0.001, "loss": 1.9092, "step": 363300 }, { "epoch": 21.255190969175878, "grad_norm": 0.24096447229385376, "learning_rate": 0.001, "loss": 1.9121, "step": 363400 }, { "epoch": 21.261039948528982, "grad_norm": 0.20556165277957916, "learning_rate": 0.001, "loss": 1.9109, "step": 363500 }, { "epoch": 21.266888927882086, "grad_norm": 0.26544347405433655, "learning_rate": 0.001, "loss": 1.9176, "step": 363600 }, { "epoch": 21.272737907235186, "grad_norm": 0.2420065701007843, "learning_rate": 0.001, "loss": 1.9095, "step": 363700 }, { "epoch": 21.27858688658829, "grad_norm": 0.19620293378829956, "learning_rate": 0.001, "loss": 1.9089, "step": 363800 }, { "epoch": 21.284435865941393, "grad_norm": 0.26469913125038147, "learning_rate": 0.001, "loss": 1.9066, "step": 363900 }, { "epoch": 21.290284845294497, "grad_norm": 0.1883971393108368, "learning_rate": 0.001, "loss": 1.9133, "step": 364000 }, { "epoch": 21.296133824647598, "grad_norm": 0.28033968806266785, "learning_rate": 0.001, "loss": 1.9082, "step": 364100 }, { "epoch": 21.3019828040007, "grad_norm": 0.26321378350257874, "learning_rate": 0.001, "loss": 1.9093, "step": 364200 }, { "epoch": 21.307831783353805, "grad_norm": 0.2168392539024353, "learning_rate": 0.001, "loss": 1.909, "step": 364300 }, { "epoch": 21.31368076270691, "grad_norm": 0.21585652232170105, "learning_rate": 0.001, "loss": 1.9091, "step": 364400 }, { "epoch": 21.31952974206001, "grad_norm": 0.23123757541179657, "learning_rate": 0.001, "loss": 1.9145, "step": 364500 }, { "epoch": 21.325378721413113, "grad_norm": 0.26507723331451416, "learning_rate": 0.001, "loss": 1.9121, "step": 364600 }, { "epoch": 21.331227700766217, "grad_norm": 0.23130962252616882, "learning_rate": 0.001, "loss": 1.9074, "step": 364700 }, { "epoch": 21.33707668011932, "grad_norm": 0.3013931214809418, "learning_rate": 0.001, "loss": 1.9083, "step": 364800 }, { "epoch": 21.34292565947242, "grad_norm": 0.25971224904060364, "learning_rate": 0.001, "loss": 1.9146, "step": 364900 }, { "epoch": 21.348774638825525, "grad_norm": 0.23321925103664398, "learning_rate": 0.001, "loss": 1.9098, "step": 365000 }, { "epoch": 21.35462361817863, "grad_norm": 0.25657331943511963, "learning_rate": 0.001, "loss": 1.9149, "step": 365100 }, { "epoch": 21.360472597531732, "grad_norm": 0.23586560785770416, "learning_rate": 0.001, "loss": 1.9139, "step": 365200 }, { "epoch": 21.366321576884832, "grad_norm": 0.22450147569179535, "learning_rate": 0.001, "loss": 1.9071, "step": 365300 }, { "epoch": 21.372170556237936, "grad_norm": 0.2477673590183258, "learning_rate": 0.001, "loss": 1.9191, "step": 365400 }, { "epoch": 21.37801953559104, "grad_norm": 0.194982647895813, "learning_rate": 0.001, "loss": 1.9087, "step": 365500 }, { "epoch": 21.383868514944144, "grad_norm": 0.25887128710746765, "learning_rate": 0.001, "loss": 1.9107, "step": 365600 }, { "epoch": 21.389717494297244, "grad_norm": 0.24578475952148438, "learning_rate": 0.001, "loss": 1.9136, "step": 365700 }, { "epoch": 21.395566473650348, "grad_norm": 0.2889209985733032, "learning_rate": 0.001, "loss": 1.9123, "step": 365800 }, { "epoch": 21.40141545300345, "grad_norm": 0.21646015346050262, "learning_rate": 0.001, "loss": 1.913, "step": 365900 }, { "epoch": 21.407264432356556, "grad_norm": 0.20428813993930817, "learning_rate": 0.001, "loss": 1.9082, "step": 366000 }, { "epoch": 21.413113411709656, "grad_norm": 0.1850145310163498, "learning_rate": 0.001, "loss": 1.9127, "step": 366100 }, { "epoch": 21.41896239106276, "grad_norm": 0.30639106035232544, "learning_rate": 0.001, "loss": 1.907, "step": 366200 }, { "epoch": 21.424811370415863, "grad_norm": 0.24757254123687744, "learning_rate": 0.001, "loss": 1.9107, "step": 366300 }, { "epoch": 21.430660349768964, "grad_norm": 0.2512439489364624, "learning_rate": 0.001, "loss": 1.9086, "step": 366400 }, { "epoch": 21.436509329122067, "grad_norm": 0.2895801067352295, "learning_rate": 0.001, "loss": 1.9094, "step": 366500 }, { "epoch": 21.44235830847517, "grad_norm": 0.3122473955154419, "learning_rate": 0.001, "loss": 1.9162, "step": 366600 }, { "epoch": 21.448207287828275, "grad_norm": 0.238468199968338, "learning_rate": 0.001, "loss": 1.9165, "step": 366700 }, { "epoch": 21.454056267181375, "grad_norm": 0.23066329956054688, "learning_rate": 0.001, "loss": 1.9089, "step": 366800 }, { "epoch": 21.45990524653448, "grad_norm": 0.25507253408432007, "learning_rate": 0.001, "loss": 1.9168, "step": 366900 }, { "epoch": 21.465754225887583, "grad_norm": 0.3082094192504883, "learning_rate": 0.001, "loss": 1.9231, "step": 367000 }, { "epoch": 21.471603205240687, "grad_norm": 0.2900315821170807, "learning_rate": 0.001, "loss": 1.917, "step": 367100 }, { "epoch": 21.477452184593787, "grad_norm": 0.20476309955120087, "learning_rate": 0.001, "loss": 1.9166, "step": 367200 }, { "epoch": 21.48330116394689, "grad_norm": 0.24048201739788055, "learning_rate": 0.001, "loss": 1.9046, "step": 367300 }, { "epoch": 21.489150143299995, "grad_norm": 0.20890548825263977, "learning_rate": 0.001, "loss": 1.9114, "step": 367400 }, { "epoch": 21.4949991226531, "grad_norm": 0.25879624485969543, "learning_rate": 0.001, "loss": 1.9133, "step": 367500 }, { "epoch": 21.5008481020062, "grad_norm": 0.23414531350135803, "learning_rate": 0.001, "loss": 1.9132, "step": 367600 }, { "epoch": 21.506697081359302, "grad_norm": 0.2636049687862396, "learning_rate": 0.001, "loss": 1.9164, "step": 367700 }, { "epoch": 21.512546060712406, "grad_norm": 0.24619652330875397, "learning_rate": 0.001, "loss": 1.9102, "step": 367800 }, { "epoch": 21.51839504006551, "grad_norm": 0.3038586676120758, "learning_rate": 0.001, "loss": 1.9172, "step": 367900 }, { "epoch": 21.52424401941861, "grad_norm": 0.22180742025375366, "learning_rate": 0.001, "loss": 1.9154, "step": 368000 }, { "epoch": 21.530092998771714, "grad_norm": 0.2596951127052307, "learning_rate": 0.001, "loss": 1.9088, "step": 368100 }, { "epoch": 21.535941978124818, "grad_norm": 0.2719852924346924, "learning_rate": 0.001, "loss": 1.9156, "step": 368200 }, { "epoch": 21.54179095747792, "grad_norm": 0.37154731154441833, "learning_rate": 0.001, "loss": 1.9166, "step": 368300 }, { "epoch": 21.547639936831022, "grad_norm": 0.24909406900405884, "learning_rate": 0.001, "loss": 1.9151, "step": 368400 }, { "epoch": 21.553488916184126, "grad_norm": 0.2218729704618454, "learning_rate": 0.001, "loss": 1.9178, "step": 368500 }, { "epoch": 21.55933789553723, "grad_norm": 0.20277811586856842, "learning_rate": 0.001, "loss": 1.9167, "step": 368600 }, { "epoch": 21.565186874890333, "grad_norm": 0.2515273094177246, "learning_rate": 0.001, "loss": 1.9127, "step": 368700 }, { "epoch": 21.571035854243434, "grad_norm": 0.2464931607246399, "learning_rate": 0.001, "loss": 1.9154, "step": 368800 }, { "epoch": 21.576884833596537, "grad_norm": 0.21685686707496643, "learning_rate": 0.001, "loss": 1.9132, "step": 368900 }, { "epoch": 21.58273381294964, "grad_norm": 0.2036443054676056, "learning_rate": 0.001, "loss": 1.9151, "step": 369000 }, { "epoch": 21.588582792302745, "grad_norm": 0.2912254333496094, "learning_rate": 0.001, "loss": 1.912, "step": 369100 }, { "epoch": 21.594431771655845, "grad_norm": 0.23016992211341858, "learning_rate": 0.001, "loss": 1.9123, "step": 369200 }, { "epoch": 21.60028075100895, "grad_norm": 0.2375362366437912, "learning_rate": 0.001, "loss": 1.9121, "step": 369300 }, { "epoch": 21.606129730362053, "grad_norm": 0.23857763409614563, "learning_rate": 0.001, "loss": 1.9199, "step": 369400 }, { "epoch": 21.611978709715153, "grad_norm": 0.20427608489990234, "learning_rate": 0.001, "loss": 1.9154, "step": 369500 }, { "epoch": 21.617827689068257, "grad_norm": 0.20571587979793549, "learning_rate": 0.001, "loss": 1.9081, "step": 369600 }, { "epoch": 21.62367666842136, "grad_norm": 0.23407864570617676, "learning_rate": 0.001, "loss": 1.9077, "step": 369700 }, { "epoch": 21.629525647774464, "grad_norm": 0.2590106725692749, "learning_rate": 0.001, "loss": 1.9111, "step": 369800 }, { "epoch": 21.635374627127565, "grad_norm": 0.23235668241977692, "learning_rate": 0.001, "loss": 1.9204, "step": 369900 }, { "epoch": 21.64122360648067, "grad_norm": 0.20657993853092194, "learning_rate": 0.001, "loss": 1.9099, "step": 370000 }, { "epoch": 21.647072585833772, "grad_norm": 0.2736148238182068, "learning_rate": 0.001, "loss": 1.9134, "step": 370100 }, { "epoch": 21.652921565186876, "grad_norm": 0.25317510962486267, "learning_rate": 0.001, "loss": 1.9143, "step": 370200 }, { "epoch": 21.658770544539976, "grad_norm": 0.38344913721084595, "learning_rate": 0.001, "loss": 1.9152, "step": 370300 }, { "epoch": 21.66461952389308, "grad_norm": 0.26002684235572815, "learning_rate": 0.001, "loss": 1.9118, "step": 370400 }, { "epoch": 21.670468503246184, "grad_norm": 0.2600996196269989, "learning_rate": 0.001, "loss": 1.9117, "step": 370500 }, { "epoch": 21.676317482599288, "grad_norm": 0.2168278992176056, "learning_rate": 0.001, "loss": 1.9148, "step": 370600 }, { "epoch": 21.682166461952388, "grad_norm": 0.23482313752174377, "learning_rate": 0.001, "loss": 1.913, "step": 370700 }, { "epoch": 21.68801544130549, "grad_norm": 0.30047640204429626, "learning_rate": 0.001, "loss": 1.9175, "step": 370800 }, { "epoch": 21.693864420658596, "grad_norm": 0.2053273767232895, "learning_rate": 0.001, "loss": 1.9126, "step": 370900 }, { "epoch": 21.6997134000117, "grad_norm": 0.21166393160820007, "learning_rate": 0.001, "loss": 1.9214, "step": 371000 }, { "epoch": 21.7055623793648, "grad_norm": 0.25149911642074585, "learning_rate": 0.001, "loss": 1.9174, "step": 371100 }, { "epoch": 21.711411358717903, "grad_norm": 0.23925843834877014, "learning_rate": 0.001, "loss": 1.9161, "step": 371200 }, { "epoch": 21.717260338071007, "grad_norm": 0.273587167263031, "learning_rate": 0.001, "loss": 1.9116, "step": 371300 }, { "epoch": 21.72310931742411, "grad_norm": 0.2890111804008484, "learning_rate": 0.001, "loss": 1.9145, "step": 371400 }, { "epoch": 21.72895829677721, "grad_norm": 0.2392122745513916, "learning_rate": 0.001, "loss": 1.9162, "step": 371500 }, { "epoch": 21.734807276130315, "grad_norm": 0.27084946632385254, "learning_rate": 0.001, "loss": 1.9189, "step": 371600 }, { "epoch": 21.74065625548342, "grad_norm": 0.24086949229240417, "learning_rate": 0.001, "loss": 1.9113, "step": 371700 }, { "epoch": 21.746505234836523, "grad_norm": 0.2803862988948822, "learning_rate": 0.001, "loss": 1.9147, "step": 371800 }, { "epoch": 21.752354214189623, "grad_norm": 0.23098699748516083, "learning_rate": 0.001, "loss": 1.9182, "step": 371900 }, { "epoch": 21.758203193542727, "grad_norm": 0.22637398540973663, "learning_rate": 0.001, "loss": 1.9124, "step": 372000 }, { "epoch": 21.76405217289583, "grad_norm": 0.22712182998657227, "learning_rate": 0.001, "loss": 1.9102, "step": 372100 }, { "epoch": 21.769901152248934, "grad_norm": 0.28336894512176514, "learning_rate": 0.001, "loss": 1.914, "step": 372200 }, { "epoch": 21.775750131602035, "grad_norm": 0.2820446789264679, "learning_rate": 0.001, "loss": 1.9152, "step": 372300 }, { "epoch": 21.78159911095514, "grad_norm": 0.25021979212760925, "learning_rate": 0.001, "loss": 1.9157, "step": 372400 }, { "epoch": 21.787448090308242, "grad_norm": 0.23017366230487823, "learning_rate": 0.001, "loss": 1.9173, "step": 372500 }, { "epoch": 21.793297069661342, "grad_norm": 0.2745993435382843, "learning_rate": 0.001, "loss": 1.9124, "step": 372600 }, { "epoch": 21.799146049014446, "grad_norm": 0.20697854459285736, "learning_rate": 0.001, "loss": 1.9094, "step": 372700 }, { "epoch": 21.80499502836755, "grad_norm": 0.30323466658592224, "learning_rate": 0.001, "loss": 1.9107, "step": 372800 }, { "epoch": 21.810844007720654, "grad_norm": 0.3277035653591156, "learning_rate": 0.001, "loss": 1.916, "step": 372900 }, { "epoch": 21.816692987073754, "grad_norm": 0.2722575068473816, "learning_rate": 0.001, "loss": 1.9201, "step": 373000 }, { "epoch": 21.822541966426858, "grad_norm": 0.24068821966648102, "learning_rate": 0.001, "loss": 1.9118, "step": 373100 }, { "epoch": 21.82839094577996, "grad_norm": 0.24095003306865692, "learning_rate": 0.001, "loss": 1.9163, "step": 373200 }, { "epoch": 21.834239925133065, "grad_norm": 0.2751547694206238, "learning_rate": 0.001, "loss": 1.9138, "step": 373300 }, { "epoch": 21.840088904486166, "grad_norm": 0.23474065959453583, "learning_rate": 0.001, "loss": 1.9166, "step": 373400 }, { "epoch": 21.84593788383927, "grad_norm": 0.17100371420383453, "learning_rate": 0.001, "loss": 1.9076, "step": 373500 }, { "epoch": 21.851786863192373, "grad_norm": 0.2640243470668793, "learning_rate": 0.001, "loss": 1.914, "step": 373600 }, { "epoch": 21.857635842545477, "grad_norm": 0.22356024384498596, "learning_rate": 0.001, "loss": 1.9151, "step": 373700 }, { "epoch": 21.863484821898577, "grad_norm": 0.23938141763210297, "learning_rate": 0.001, "loss": 1.9141, "step": 373800 }, { "epoch": 21.86933380125168, "grad_norm": 0.2097918540239334, "learning_rate": 0.001, "loss": 1.9175, "step": 373900 }, { "epoch": 21.875182780604785, "grad_norm": 0.23801058530807495, "learning_rate": 0.001, "loss": 1.9127, "step": 374000 }, { "epoch": 21.88103175995789, "grad_norm": 0.29907482862472534, "learning_rate": 0.001, "loss": 1.915, "step": 374100 }, { "epoch": 21.88688073931099, "grad_norm": 0.19141387939453125, "learning_rate": 0.001, "loss": 1.9161, "step": 374200 }, { "epoch": 21.892729718664093, "grad_norm": 0.28387051820755005, "learning_rate": 0.001, "loss": 1.9163, "step": 374300 }, { "epoch": 21.898578698017197, "grad_norm": 0.26506495475769043, "learning_rate": 0.001, "loss": 1.9129, "step": 374400 }, { "epoch": 21.9044276773703, "grad_norm": 0.21841676533222198, "learning_rate": 0.001, "loss": 1.9122, "step": 374500 }, { "epoch": 21.9102766567234, "grad_norm": 0.2571428716182709, "learning_rate": 0.001, "loss": 1.9186, "step": 374600 }, { "epoch": 21.916125636076504, "grad_norm": 0.27478718757629395, "learning_rate": 0.001, "loss": 1.917, "step": 374700 }, { "epoch": 21.92197461542961, "grad_norm": 0.28566059470176697, "learning_rate": 0.001, "loss": 1.9209, "step": 374800 }, { "epoch": 21.927823594782712, "grad_norm": 0.20145650207996368, "learning_rate": 0.001, "loss": 1.9102, "step": 374900 }, { "epoch": 21.933672574135812, "grad_norm": 0.3138509690761566, "learning_rate": 0.001, "loss": 1.9099, "step": 375000 }, { "epoch": 21.933672574135812, "eval_ag_news_accuracy": 0.23784375, "eval_ag_news_bleu_score": 6.538768959738783, "eval_ag_news_bleu_score_sem": 0.4396058774410828, "eval_ag_news_emb_cos_sim": 0.6930217742919922, "eval_ag_news_emb_cos_sim_sem": 0.014482175931334496, "eval_ag_news_emb_top1_equal": 0.9296875, "eval_ag_news_emb_top1_equal_sem": 0.022687306627631187, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7293338775634766, "eval_ag_news_n_ngrams_match_1": 13.53125, "eval_ag_news_n_ngrams_match_2": 3.7421875, "eval_ag_news_n_ngrams_match_3": 1.3984375, "eval_ag_news_num_pred_words": 44.9921875, "eval_ag_news_num_true_words": 44.5234375, "eval_ag_news_perplexity": 15.322676840838774, "eval_ag_news_pred_num_tokens": 71.6640625, "eval_ag_news_rouge_score": 0.2902903378563164, "eval_ag_news_runtime": 36.3981, "eval_ag_news_samples_per_second": 13.737, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3264629008083208, "eval_ag_news_token_set_f1_sem": 0.009743726614476542, "eval_ag_news_token_set_precision": 0.30435177285106196, "eval_ag_news_token_set_recall": 0.36739504387740274, "eval_ag_news_true_num_tokens": 62.6015625, "step": 375000 }, { "epoch": 21.933672574135812, "eval_anthropic_toxic_prompts_accuracy": 0.102796875, "eval_anthropic_toxic_prompts_bleu_score": 40.62061962263306, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.8193015578351885, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8927826881408691, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009504914283752441, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.15625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03221922171672519, "eval_anthropic_toxic_prompts_loss": 1.288428544998169, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.1484375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.4453125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.421875, "eval_anthropic_toxic_prompts_num_pred_words": 15.2265625, "eval_anthropic_toxic_prompts_num_true_words": 14.7734375, "eval_anthropic_toxic_prompts_perplexity": 3.627082278330709, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.625, "eval_anthropic_toxic_prompts_rouge_score": 0.67139339373795, "eval_anthropic_toxic_prompts_runtime": 29.2236, "eval_anthropic_toxic_prompts_samples_per_second": 17.109, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6945419081600953, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018972173316810885, "eval_anthropic_toxic_prompts_token_set_precision": 0.6944803870961718, "eval_anthropic_toxic_prompts_token_set_recall": 0.6994042377403842, "eval_anthropic_toxic_prompts_true_num_tokens": 18.1953125, "step": 375000 }, { "epoch": 21.933672574135812, "eval_arxiv_accuracy": 0.36875, "eval_arxiv_bleu_score": 1.558215249335794, "eval_arxiv_bleu_score_sem": 0.13977441730407827, "eval_arxiv_emb_cos_sim": 0.4881328344345093, "eval_arxiv_emb_cos_sim_sem": 0.018796345219016075, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.479440450668335, "eval_arxiv_n_ngrams_match_1": 12.625, "eval_arxiv_n_ngrams_match_2": 2.2734375, "eval_arxiv_n_ngrams_match_3": 0.4609375, "eval_arxiv_num_pred_words": 51.96875, "eval_arxiv_num_true_words": 85.234375, "eval_arxiv_perplexity": 32.441564340618285, "eval_arxiv_pred_num_tokens": 125.6953125, "eval_arxiv_rouge_score": 0.16511412630222766, "eval_arxiv_runtime": 30.4575, "eval_arxiv_samples_per_second": 16.416, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.17313784023582543, "eval_arxiv_token_set_f1_sem": 0.00841211227112951, "eval_arxiv_token_set_precision": 0.11565808175312542, "eval_arxiv_token_set_recall": 0.41579272912480064, "eval_arxiv_true_num_tokens": 123.984375, "step": 375000 }, { "epoch": 21.933672574135812, "eval_python_code_alpaca_accuracy": 0.12990625, "eval_python_code_alpaca_bleu_score": 26.544394863293956, "eval_python_code_alpaca_bleu_score_sem": 1.6386780365520548, "eval_python_code_alpaca_emb_cos_sim": 0.8599969148635864, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009667730890214443, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5585647821426392, "eval_python_code_alpaca_n_ngrams_match_1": 10.3828125, "eval_python_code_alpaca_n_ngrams_match_2": 5.5625, "eval_python_code_alpaca_n_ngrams_match_3": 3.15625, "eval_python_code_alpaca_num_pred_words": 18.046875, "eval_python_code_alpaca_num_true_words": 18.7109375, "eval_python_code_alpaca_perplexity": 4.751996198792588, "eval_python_code_alpaca_pred_num_tokens": 24.015625, "eval_python_code_alpaca_rouge_score": 0.5833860471407497, "eval_python_code_alpaca_runtime": 28.9311, "eval_python_code_alpaca_samples_per_second": 17.282, "eval_python_code_alpaca_steps_per_second": 0.035, "eval_python_code_alpaca_token_set_f1": 0.6069715683611909, "eval_python_code_alpaca_token_set_f1_sem": 0.013668530335580673, "eval_python_code_alpaca_token_set_precision": 0.5954700452308801, "eval_python_code_alpaca_token_set_recall": 0.6270893898957067, "eval_python_code_alpaca_true_num_tokens": 24.2421875, "step": 375000 }, { "epoch": 21.933672574135812, "eval_wikibio_accuracy": 0.365140625, "eval_wikibio_bleu_score": 6.75957404411535, "eval_wikibio_bleu_score_sem": 0.5654519335800827, "eval_wikibio_emb_cos_sim": 0.6229758262634277, "eval_wikibio_emb_cos_sim_sem": 0.020657341927289963, "eval_wikibio_emb_top1_equal": 0.96875, "eval_wikibio_emb_top1_equal_sem": 0.01543935015797615, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.747328996658325, "eval_wikibio_n_ngrams_match_1": 15.9296875, "eval_wikibio_n_ngrams_match_2": 5.140625, "eval_wikibio_n_ngrams_match_3": 2.0078125, "eval_wikibio_num_pred_words": 56.1640625, "eval_wikibio_num_true_words": 53.234375, "eval_wikibio_perplexity": 15.600906111798528, "eval_wikibio_pred_num_tokens": 109.296875, "eval_wikibio_rouge_score": 0.2944711489104092, "eval_wikibio_runtime": 30.6515, "eval_wikibio_samples_per_second": 16.312, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.3220096126295571, "eval_wikibio_token_set_f1_sem": 0.011321925474787931, "eval_wikibio_token_set_precision": 0.2874212002009638, "eval_wikibio_token_set_recall": 0.39983371222146835, "eval_wikibio_true_num_tokens": 102.1875, "step": 375000 }, { "epoch": 21.933672574135812, "eval_msmarco_accuracy": 0.384515625, "eval_msmarco_bleu_score": 14.375793174858588, "eval_msmarco_bleu_score_sem": 1.0653568358465415, "eval_msmarco_emb_cos_sim": 0.7657182216644287, "eval_msmarco_emb_cos_sim_sem": 0.016846507787704468, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7635595798492432, "eval_msmarco_n_ngrams_match_1": 26.6484375, "eval_msmarco_n_ngrams_match_2": 11.140625, "eval_msmarco_n_ngrams_match_3": 5.8671875, "eval_msmarco_num_pred_words": 60.0390625, "eval_msmarco_num_true_words": 60.84375, "eval_msmarco_perplexity": 5.833164096717653, "eval_msmarco_pred_num_tokens": 84.21875, "eval_msmarco_rouge_score": 0.41895545624181896, "eval_msmarco_runtime": 25.2934, "eval_msmarco_samples_per_second": 19.768, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.4472950803068212, "eval_msmarco_token_set_f1_sem": 0.012291063110755661, "eval_msmarco_token_set_precision": 0.4082818966803654, "eval_msmarco_token_set_recall": 0.5144812369741186, "eval_msmarco_true_num_tokens": 78.953125, "step": 375000 }, { "epoch": 21.939521553488916, "grad_norm": 0.2555379867553711, "learning_rate": 0.001, "loss": 1.9087, "step": 375100 }, { "epoch": 21.94537053284202, "grad_norm": 0.23577946424484253, "learning_rate": 0.001, "loss": 1.9132, "step": 375200 }, { "epoch": 21.951219512195124, "grad_norm": 0.2068188339471817, "learning_rate": 0.001, "loss": 1.9186, "step": 375300 }, { "epoch": 21.957068491548224, "grad_norm": 0.2565460205078125, "learning_rate": 0.001, "loss": 1.9122, "step": 375400 }, { "epoch": 21.962917470901328, "grad_norm": 0.24698257446289062, "learning_rate": 0.001, "loss": 1.9125, "step": 375500 }, { "epoch": 21.96876645025443, "grad_norm": 0.2063770890235901, "learning_rate": 0.001, "loss": 1.9175, "step": 375600 }, { "epoch": 21.974615429607532, "grad_norm": 0.22871479392051697, "learning_rate": 0.001, "loss": 1.9145, "step": 375700 }, { "epoch": 21.980464408960636, "grad_norm": 0.29342353343963623, "learning_rate": 0.001, "loss": 1.9138, "step": 375800 }, { "epoch": 21.98631338831374, "grad_norm": 0.2444133758544922, "learning_rate": 0.001, "loss": 1.9106, "step": 375900 }, { "epoch": 21.992162367666843, "grad_norm": 0.2053886353969574, "learning_rate": 0.001, "loss": 1.917, "step": 376000 }, { "epoch": 21.998011347019943, "grad_norm": 0.3157656490802765, "learning_rate": 0.001, "loss": 1.9137, "step": 376100 }, { "epoch": 22.003860326373047, "grad_norm": 0.21664872765541077, "learning_rate": 0.001, "loss": 1.9089, "step": 376200 }, { "epoch": 22.00970930572615, "grad_norm": 0.18933512270450592, "learning_rate": 0.001, "loss": 1.9026, "step": 376300 }, { "epoch": 22.015558285079255, "grad_norm": 0.156101256608963, "learning_rate": 0.001, "loss": 1.8939, "step": 376400 }, { "epoch": 22.021407264432355, "grad_norm": 0.1745503842830658, "learning_rate": 0.001, "loss": 1.8948, "step": 376500 }, { "epoch": 22.02725624378546, "grad_norm": 0.21438059210777283, "learning_rate": 0.001, "loss": 1.8992, "step": 376600 }, { "epoch": 22.033105223138563, "grad_norm": 0.24229244887828827, "learning_rate": 0.001, "loss": 1.8938, "step": 376700 }, { "epoch": 22.038954202491666, "grad_norm": 0.2196320742368698, "learning_rate": 0.001, "loss": 1.9018, "step": 376800 }, { "epoch": 22.044803181844767, "grad_norm": 0.21859747171401978, "learning_rate": 0.001, "loss": 1.8972, "step": 376900 }, { "epoch": 22.05065216119787, "grad_norm": 0.30129316449165344, "learning_rate": 0.001, "loss": 1.9054, "step": 377000 }, { "epoch": 22.056501140550974, "grad_norm": 0.2598526179790497, "learning_rate": 0.001, "loss": 1.8969, "step": 377100 }, { "epoch": 22.062350119904078, "grad_norm": 0.25108930468559265, "learning_rate": 0.001, "loss": 1.9016, "step": 377200 }, { "epoch": 22.06819909925718, "grad_norm": 0.22692793607711792, "learning_rate": 0.001, "loss": 1.9048, "step": 377300 }, { "epoch": 22.074048078610282, "grad_norm": 0.25089773535728455, "learning_rate": 0.001, "loss": 1.9078, "step": 377400 }, { "epoch": 22.079897057963386, "grad_norm": 0.2079874724149704, "learning_rate": 0.001, "loss": 1.8993, "step": 377500 }, { "epoch": 22.08574603731649, "grad_norm": 0.1876765787601471, "learning_rate": 0.001, "loss": 1.8986, "step": 377600 }, { "epoch": 22.09159501666959, "grad_norm": 0.18787896633148193, "learning_rate": 0.001, "loss": 1.8961, "step": 377700 }, { "epoch": 22.097443996022694, "grad_norm": 0.19193170964717865, "learning_rate": 0.001, "loss": 1.9028, "step": 377800 }, { "epoch": 22.103292975375798, "grad_norm": 0.30953606963157654, "learning_rate": 0.001, "loss": 1.9027, "step": 377900 }, { "epoch": 22.1091419547289, "grad_norm": 0.16126681864261627, "learning_rate": 0.001, "loss": 1.9011, "step": 378000 }, { "epoch": 22.114990934082, "grad_norm": 0.18537548184394836, "learning_rate": 0.001, "loss": 1.8985, "step": 378100 }, { "epoch": 22.120839913435105, "grad_norm": 0.20151297748088837, "learning_rate": 0.001, "loss": 1.8995, "step": 378200 }, { "epoch": 22.12668889278821, "grad_norm": 0.20318040251731873, "learning_rate": 0.001, "loss": 1.9035, "step": 378300 }, { "epoch": 22.132537872141313, "grad_norm": 0.19472740590572357, "learning_rate": 0.001, "loss": 1.9011, "step": 378400 }, { "epoch": 22.138386851494413, "grad_norm": 0.22014375030994415, "learning_rate": 0.001, "loss": 1.9021, "step": 378500 }, { "epoch": 22.144235830847517, "grad_norm": 0.20938684046268463, "learning_rate": 0.001, "loss": 1.9072, "step": 378600 }, { "epoch": 22.15008481020062, "grad_norm": 0.1587643027305603, "learning_rate": 0.001, "loss": 1.9086, "step": 378700 }, { "epoch": 22.15593378955372, "grad_norm": 0.20075683295726776, "learning_rate": 0.001, "loss": 1.8992, "step": 378800 }, { "epoch": 22.161782768906825, "grad_norm": 0.23389089107513428, "learning_rate": 0.001, "loss": 1.901, "step": 378900 }, { "epoch": 22.16763174825993, "grad_norm": 0.28719592094421387, "learning_rate": 0.001, "loss": 1.9013, "step": 379000 }, { "epoch": 22.173480727613033, "grad_norm": 0.2872694730758667, "learning_rate": 0.001, "loss": 1.8987, "step": 379100 }, { "epoch": 22.179329706966133, "grad_norm": 0.25307491421699524, "learning_rate": 0.001, "loss": 1.9039, "step": 379200 }, { "epoch": 22.185178686319237, "grad_norm": 0.21768462657928467, "learning_rate": 0.001, "loss": 1.9048, "step": 379300 }, { "epoch": 22.19102766567234, "grad_norm": 0.1904667317867279, "learning_rate": 0.001, "loss": 1.9029, "step": 379400 }, { "epoch": 22.196876645025444, "grad_norm": 0.2112894505262375, "learning_rate": 0.001, "loss": 1.9007, "step": 379500 }, { "epoch": 22.202725624378544, "grad_norm": 0.17904295027256012, "learning_rate": 0.001, "loss": 1.9014, "step": 379600 }, { "epoch": 22.20857460373165, "grad_norm": 0.21239899098873138, "learning_rate": 0.001, "loss": 1.8991, "step": 379700 }, { "epoch": 22.214423583084752, "grad_norm": 0.20088079571723938, "learning_rate": 0.001, "loss": 1.8979, "step": 379800 }, { "epoch": 22.220272562437856, "grad_norm": 0.2373102307319641, "learning_rate": 0.001, "loss": 1.8962, "step": 379900 }, { "epoch": 22.226121541790956, "grad_norm": 0.19922736287117004, "learning_rate": 0.001, "loss": 1.9085, "step": 380000 }, { "epoch": 22.23197052114406, "grad_norm": 0.21146702766418457, "learning_rate": 0.001, "loss": 1.9062, "step": 380100 }, { "epoch": 22.237819500497164, "grad_norm": 0.20337392389774323, "learning_rate": 0.001, "loss": 1.9059, "step": 380200 }, { "epoch": 22.243668479850268, "grad_norm": 0.22370576858520508, "learning_rate": 0.001, "loss": 1.9075, "step": 380300 }, { "epoch": 22.249517459203368, "grad_norm": 0.228904128074646, "learning_rate": 0.001, "loss": 1.9079, "step": 380400 }, { "epoch": 22.25536643855647, "grad_norm": 0.29161956906318665, "learning_rate": 0.001, "loss": 1.9093, "step": 380500 }, { "epoch": 22.261215417909575, "grad_norm": 0.1847153604030609, "learning_rate": 0.001, "loss": 1.9051, "step": 380600 }, { "epoch": 22.26706439726268, "grad_norm": 0.21187558770179749, "learning_rate": 0.001, "loss": 1.908, "step": 380700 }, { "epoch": 22.27291337661578, "grad_norm": 0.20663149654865265, "learning_rate": 0.001, "loss": 1.9069, "step": 380800 }, { "epoch": 22.278762355968883, "grad_norm": 0.18009062111377716, "learning_rate": 0.001, "loss": 1.9015, "step": 380900 }, { "epoch": 22.284611335321987, "grad_norm": 0.216348335146904, "learning_rate": 0.001, "loss": 1.9083, "step": 381000 }, { "epoch": 22.29046031467509, "grad_norm": 0.2518792152404785, "learning_rate": 0.001, "loss": 1.8986, "step": 381100 }, { "epoch": 22.29630929402819, "grad_norm": 0.24878495931625366, "learning_rate": 0.001, "loss": 1.9048, "step": 381200 }, { "epoch": 22.302158273381295, "grad_norm": 0.20778927206993103, "learning_rate": 0.001, "loss": 1.896, "step": 381300 }, { "epoch": 22.3080072527344, "grad_norm": 0.24839849770069122, "learning_rate": 0.001, "loss": 1.906, "step": 381400 }, { "epoch": 22.313856232087502, "grad_norm": 0.2706730365753174, "learning_rate": 0.001, "loss": 1.9056, "step": 381500 }, { "epoch": 22.319705211440603, "grad_norm": 0.24171854555606842, "learning_rate": 0.001, "loss": 1.9044, "step": 381600 }, { "epoch": 22.325554190793707, "grad_norm": 0.19026009738445282, "learning_rate": 0.001, "loss": 1.9027, "step": 381700 }, { "epoch": 22.33140317014681, "grad_norm": 0.2128635048866272, "learning_rate": 0.001, "loss": 1.9036, "step": 381800 }, { "epoch": 22.33725214949991, "grad_norm": 0.20740240812301636, "learning_rate": 0.001, "loss": 1.9048, "step": 381900 }, { "epoch": 22.343101128853014, "grad_norm": 0.21688716113567352, "learning_rate": 0.001, "loss": 1.8993, "step": 382000 }, { "epoch": 22.348950108206118, "grad_norm": 0.24069392681121826, "learning_rate": 0.001, "loss": 1.9066, "step": 382100 }, { "epoch": 22.354799087559222, "grad_norm": 0.26731961965560913, "learning_rate": 0.001, "loss": 1.909, "step": 382200 }, { "epoch": 22.360648066912322, "grad_norm": 0.24388617277145386, "learning_rate": 0.001, "loss": 1.9081, "step": 382300 }, { "epoch": 22.366497046265426, "grad_norm": 0.19185982644557953, "learning_rate": 0.001, "loss": 1.91, "step": 382400 }, { "epoch": 22.37234602561853, "grad_norm": 0.29974564909935, "learning_rate": 0.001, "loss": 1.9095, "step": 382500 }, { "epoch": 22.378195004971634, "grad_norm": 0.1902867704629898, "learning_rate": 0.001, "loss": 1.908, "step": 382600 }, { "epoch": 22.384043984324734, "grad_norm": 0.19500668346881866, "learning_rate": 0.001, "loss": 1.9018, "step": 382700 }, { "epoch": 22.389892963677838, "grad_norm": 0.28461554646492004, "learning_rate": 0.001, "loss": 1.9066, "step": 382800 }, { "epoch": 22.39574194303094, "grad_norm": 0.1866457611322403, "learning_rate": 0.001, "loss": 1.9038, "step": 382900 }, { "epoch": 22.401590922384045, "grad_norm": 0.2246304750442505, "learning_rate": 0.001, "loss": 1.908, "step": 383000 }, { "epoch": 22.407439901737146, "grad_norm": 0.1668243110179901, "learning_rate": 0.001, "loss": 1.9045, "step": 383100 }, { "epoch": 22.41328888109025, "grad_norm": 0.17437897622585297, "learning_rate": 0.001, "loss": 1.9037, "step": 383200 }, { "epoch": 22.419137860443353, "grad_norm": 0.2247334122657776, "learning_rate": 0.001, "loss": 1.9083, "step": 383300 }, { "epoch": 22.424986839796457, "grad_norm": 0.22595365345478058, "learning_rate": 0.001, "loss": 1.9016, "step": 383400 }, { "epoch": 22.430835819149557, "grad_norm": 0.23173782229423523, "learning_rate": 0.001, "loss": 1.9027, "step": 383500 }, { "epoch": 22.43668479850266, "grad_norm": 0.25577759742736816, "learning_rate": 0.001, "loss": 1.9028, "step": 383600 }, { "epoch": 22.442533777855765, "grad_norm": 0.21170583367347717, "learning_rate": 0.001, "loss": 1.9085, "step": 383700 }, { "epoch": 22.44838275720887, "grad_norm": 0.20879775285720825, "learning_rate": 0.001, "loss": 1.9074, "step": 383800 }, { "epoch": 22.45423173656197, "grad_norm": 0.23381675779819489, "learning_rate": 0.001, "loss": 1.9075, "step": 383900 }, { "epoch": 22.460080715915073, "grad_norm": 0.20465633273124695, "learning_rate": 0.001, "loss": 1.9144, "step": 384000 }, { "epoch": 22.465929695268176, "grad_norm": 0.23356151580810547, "learning_rate": 0.001, "loss": 1.9048, "step": 384100 }, { "epoch": 22.47177867462128, "grad_norm": 0.26018500328063965, "learning_rate": 0.001, "loss": 1.8991, "step": 384200 }, { "epoch": 22.47762765397438, "grad_norm": 0.20699115097522736, "learning_rate": 0.001, "loss": 1.903, "step": 384300 }, { "epoch": 22.483476633327484, "grad_norm": 0.2690514028072357, "learning_rate": 0.001, "loss": 1.9081, "step": 384400 }, { "epoch": 22.489325612680588, "grad_norm": 0.19480620324611664, "learning_rate": 0.001, "loss": 1.9071, "step": 384500 }, { "epoch": 22.495174592033692, "grad_norm": 0.19392773509025574, "learning_rate": 0.001, "loss": 1.9015, "step": 384600 }, { "epoch": 22.501023571386792, "grad_norm": 0.2217402160167694, "learning_rate": 0.001, "loss": 1.9065, "step": 384700 }, { "epoch": 22.506872550739896, "grad_norm": 0.21958491206169128, "learning_rate": 0.001, "loss": 1.9088, "step": 384800 }, { "epoch": 22.512721530093, "grad_norm": 0.2201777547597885, "learning_rate": 0.001, "loss": 1.9106, "step": 384900 }, { "epoch": 22.5185705094461, "grad_norm": 0.2574028968811035, "learning_rate": 0.001, "loss": 1.9077, "step": 385000 }, { "epoch": 22.524419488799204, "grad_norm": 0.19753995537757874, "learning_rate": 0.001, "loss": 1.9059, "step": 385100 }, { "epoch": 22.530268468152308, "grad_norm": 0.23008470237255096, "learning_rate": 0.001, "loss": 1.9075, "step": 385200 }, { "epoch": 22.53611744750541, "grad_norm": 0.19308026134967804, "learning_rate": 0.001, "loss": 1.9132, "step": 385300 }, { "epoch": 22.54196642685851, "grad_norm": 0.26451680064201355, "learning_rate": 0.001, "loss": 1.9126, "step": 385400 }, { "epoch": 22.547815406211615, "grad_norm": 0.24847029149532318, "learning_rate": 0.001, "loss": 1.9098, "step": 385500 }, { "epoch": 22.55366438556472, "grad_norm": 0.1953101009130478, "learning_rate": 0.001, "loss": 1.9079, "step": 385600 }, { "epoch": 22.559513364917823, "grad_norm": 0.2009211778640747, "learning_rate": 0.001, "loss": 1.9067, "step": 385700 }, { "epoch": 22.565362344270923, "grad_norm": 0.19789651036262512, "learning_rate": 0.001, "loss": 1.9057, "step": 385800 }, { "epoch": 22.571211323624027, "grad_norm": 0.21872486174106598, "learning_rate": 0.001, "loss": 1.9126, "step": 385900 }, { "epoch": 22.57706030297713, "grad_norm": 0.21002191305160522, "learning_rate": 0.001, "loss": 1.9052, "step": 386000 }, { "epoch": 22.582909282330235, "grad_norm": 0.23583196103572845, "learning_rate": 0.001, "loss": 1.908, "step": 386100 }, { "epoch": 22.588758261683335, "grad_norm": 0.18684022128582, "learning_rate": 0.001, "loss": 1.9135, "step": 386200 }, { "epoch": 22.59460724103644, "grad_norm": 0.20112057030200958, "learning_rate": 0.001, "loss": 1.9038, "step": 386300 }, { "epoch": 22.600456220389542, "grad_norm": 0.18756237626075745, "learning_rate": 0.001, "loss": 1.9068, "step": 386400 }, { "epoch": 22.606305199742646, "grad_norm": 0.2159426361322403, "learning_rate": 0.001, "loss": 1.9079, "step": 386500 }, { "epoch": 22.612154179095747, "grad_norm": 0.2000078707933426, "learning_rate": 0.001, "loss": 1.9142, "step": 386600 }, { "epoch": 22.61800315844885, "grad_norm": 0.22143302857875824, "learning_rate": 0.001, "loss": 1.903, "step": 386700 }, { "epoch": 22.623852137801954, "grad_norm": 0.17409305274486542, "learning_rate": 0.001, "loss": 1.9056, "step": 386800 }, { "epoch": 22.629701117155058, "grad_norm": 0.2159634232521057, "learning_rate": 0.001, "loss": 1.9089, "step": 386900 }, { "epoch": 22.635550096508158, "grad_norm": 0.18608543276786804, "learning_rate": 0.001, "loss": 1.9092, "step": 387000 }, { "epoch": 22.641399075861262, "grad_norm": 0.1785925030708313, "learning_rate": 0.001, "loss": 1.9077, "step": 387100 }, { "epoch": 22.647248055214366, "grad_norm": 0.20073950290679932, "learning_rate": 0.001, "loss": 1.9036, "step": 387200 }, { "epoch": 22.65309703456747, "grad_norm": 0.2493356317281723, "learning_rate": 0.001, "loss": 1.9144, "step": 387300 }, { "epoch": 22.65894601392057, "grad_norm": 0.2220444679260254, "learning_rate": 0.001, "loss": 1.9092, "step": 387400 }, { "epoch": 22.664794993273674, "grad_norm": 0.299367219209671, "learning_rate": 0.001, "loss": 1.9091, "step": 387500 }, { "epoch": 22.670643972626777, "grad_norm": 0.18889658153057098, "learning_rate": 0.001, "loss": 1.9232, "step": 387600 }, { "epoch": 22.67649295197988, "grad_norm": 0.2320055067539215, "learning_rate": 0.001, "loss": 1.9076, "step": 387700 }, { "epoch": 22.68234193133298, "grad_norm": 0.23311103880405426, "learning_rate": 0.001, "loss": 1.9113, "step": 387800 }, { "epoch": 22.688190910686085, "grad_norm": 0.1842062920331955, "learning_rate": 0.001, "loss": 1.9073, "step": 387900 }, { "epoch": 22.69403989003919, "grad_norm": 0.19879025220870972, "learning_rate": 0.001, "loss": 1.9018, "step": 388000 }, { "epoch": 22.69988886939229, "grad_norm": 0.29100099205970764, "learning_rate": 0.001, "loss": 1.9129, "step": 388100 }, { "epoch": 22.705737848745393, "grad_norm": 0.19816288352012634, "learning_rate": 0.001, "loss": 1.9115, "step": 388200 }, { "epoch": 22.711586828098497, "grad_norm": 0.17971521615982056, "learning_rate": 0.001, "loss": 1.9033, "step": 388300 }, { "epoch": 22.7174358074516, "grad_norm": 0.24362631142139435, "learning_rate": 0.001, "loss": 1.916, "step": 388400 }, { "epoch": 22.7232847868047, "grad_norm": 0.20249490439891815, "learning_rate": 0.001, "loss": 1.9062, "step": 388500 }, { "epoch": 22.729133766157805, "grad_norm": 0.23603717982769012, "learning_rate": 0.001, "loss": 1.9066, "step": 388600 }, { "epoch": 22.73498274551091, "grad_norm": 0.2274623066186905, "learning_rate": 0.001, "loss": 1.9103, "step": 388700 }, { "epoch": 22.740831724864012, "grad_norm": 0.22768503427505493, "learning_rate": 0.001, "loss": 1.906, "step": 388800 }, { "epoch": 22.746680704217113, "grad_norm": 0.25921592116355896, "learning_rate": 0.001, "loss": 1.9084, "step": 388900 }, { "epoch": 22.752529683570216, "grad_norm": 0.20024649798870087, "learning_rate": 0.001, "loss": 1.908, "step": 389000 }, { "epoch": 22.75837866292332, "grad_norm": 0.2473999559879303, "learning_rate": 0.001, "loss": 1.9127, "step": 389100 }, { "epoch": 22.764227642276424, "grad_norm": 0.22206667065620422, "learning_rate": 0.001, "loss": 1.91, "step": 389200 }, { "epoch": 22.770076621629524, "grad_norm": 0.2556186020374298, "learning_rate": 0.001, "loss": 1.908, "step": 389300 }, { "epoch": 22.775925600982628, "grad_norm": 0.21177875995635986, "learning_rate": 0.001, "loss": 1.9106, "step": 389400 }, { "epoch": 22.781774580335732, "grad_norm": 0.21512410044670105, "learning_rate": 0.001, "loss": 1.9111, "step": 389500 }, { "epoch": 22.787623559688836, "grad_norm": 0.2110896110534668, "learning_rate": 0.001, "loss": 1.9082, "step": 389600 }, { "epoch": 22.793472539041936, "grad_norm": 0.267151802778244, "learning_rate": 0.001, "loss": 1.9063, "step": 389700 }, { "epoch": 22.79932151839504, "grad_norm": 0.1962404102087021, "learning_rate": 0.001, "loss": 1.9177, "step": 389800 }, { "epoch": 22.805170497748144, "grad_norm": 0.2611929178237915, "learning_rate": 0.001, "loss": 1.9115, "step": 389900 }, { "epoch": 22.811019477101247, "grad_norm": 0.23561657965183258, "learning_rate": 0.001, "loss": 1.9093, "step": 390000 }, { "epoch": 22.816868456454348, "grad_norm": 0.2593473494052887, "learning_rate": 0.001, "loss": 1.9097, "step": 390100 }, { "epoch": 22.82271743580745, "grad_norm": 0.2023795247077942, "learning_rate": 0.001, "loss": 1.9158, "step": 390200 }, { "epoch": 22.828566415160555, "grad_norm": 0.22637903690338135, "learning_rate": 0.001, "loss": 1.9109, "step": 390300 }, { "epoch": 22.83441539451366, "grad_norm": 0.18884555995464325, "learning_rate": 0.001, "loss": 1.9118, "step": 390400 }, { "epoch": 22.84026437386676, "grad_norm": 0.20807428658008575, "learning_rate": 0.001, "loss": 1.9123, "step": 390500 }, { "epoch": 22.846113353219863, "grad_norm": 0.2298734188079834, "learning_rate": 0.001, "loss": 1.9087, "step": 390600 }, { "epoch": 22.851962332572967, "grad_norm": 0.221605122089386, "learning_rate": 0.001, "loss": 1.9087, "step": 390700 }, { "epoch": 22.85781131192607, "grad_norm": 0.16772808134555817, "learning_rate": 0.001, "loss": 1.9045, "step": 390800 }, { "epoch": 22.86366029127917, "grad_norm": 0.24891231954097748, "learning_rate": 0.001, "loss": 1.909, "step": 390900 }, { "epoch": 22.869509270632275, "grad_norm": 0.19471332430839539, "learning_rate": 0.001, "loss": 1.901, "step": 391000 }, { "epoch": 22.87535824998538, "grad_norm": 0.1790570616722107, "learning_rate": 0.001, "loss": 1.9126, "step": 391100 }, { "epoch": 22.88120722933848, "grad_norm": 0.19307483732700348, "learning_rate": 0.001, "loss": 1.9075, "step": 391200 }, { "epoch": 22.887056208691583, "grad_norm": 0.196878582239151, "learning_rate": 0.001, "loss": 1.9016, "step": 391300 }, { "epoch": 22.892905188044686, "grad_norm": 0.17998550832271576, "learning_rate": 0.001, "loss": 1.9125, "step": 391400 }, { "epoch": 22.89875416739779, "grad_norm": 0.30340635776519775, "learning_rate": 0.001, "loss": 1.9084, "step": 391500 }, { "epoch": 22.90460314675089, "grad_norm": 0.20671936869621277, "learning_rate": 0.001, "loss": 1.9157, "step": 391600 }, { "epoch": 22.910452126103994, "grad_norm": 0.21919798851013184, "learning_rate": 0.001, "loss": 1.9124, "step": 391700 }, { "epoch": 22.916301105457098, "grad_norm": 0.17689788341522217, "learning_rate": 0.001, "loss": 1.9042, "step": 391800 }, { "epoch": 22.9221500848102, "grad_norm": 0.2033175826072693, "learning_rate": 0.001, "loss": 1.9027, "step": 391900 }, { "epoch": 22.927999064163302, "grad_norm": 0.20414717495441437, "learning_rate": 0.001, "loss": 1.9102, "step": 392000 }, { "epoch": 22.933848043516406, "grad_norm": 0.20024867355823517, "learning_rate": 0.001, "loss": 1.9131, "step": 392100 }, { "epoch": 22.93969702286951, "grad_norm": 0.2328403890132904, "learning_rate": 0.001, "loss": 1.9094, "step": 392200 }, { "epoch": 22.945546002222613, "grad_norm": 0.20032820105552673, "learning_rate": 0.001, "loss": 1.9067, "step": 392300 }, { "epoch": 22.951394981575714, "grad_norm": 0.19128844141960144, "learning_rate": 0.001, "loss": 1.909, "step": 392400 }, { "epoch": 22.957243960928817, "grad_norm": 0.2947065532207489, "learning_rate": 0.001, "loss": 1.9062, "step": 392500 }, { "epoch": 22.96309294028192, "grad_norm": 0.18457558751106262, "learning_rate": 0.001, "loss": 1.905, "step": 392600 }, { "epoch": 22.968941919635025, "grad_norm": 0.18726752698421478, "learning_rate": 0.001, "loss": 1.9059, "step": 392700 }, { "epoch": 22.974790898988125, "grad_norm": 0.22373360395431519, "learning_rate": 0.001, "loss": 1.9136, "step": 392800 }, { "epoch": 22.98063987834123, "grad_norm": 0.17730332911014557, "learning_rate": 0.001, "loss": 1.9138, "step": 392900 }, { "epoch": 22.986488857694333, "grad_norm": 0.18807505071163177, "learning_rate": 0.001, "loss": 1.9045, "step": 393000 }, { "epoch": 22.992337837047437, "grad_norm": 0.2266053557395935, "learning_rate": 0.001, "loss": 1.9097, "step": 393100 }, { "epoch": 22.998186816400537, "grad_norm": 0.2278028279542923, "learning_rate": 0.001, "loss": 1.9158, "step": 393200 }, { "epoch": 23.00403579575364, "grad_norm": 0.20560866594314575, "learning_rate": 0.001, "loss": 1.8977, "step": 393300 }, { "epoch": 23.009884775106745, "grad_norm": 0.1925390362739563, "learning_rate": 0.001, "loss": 1.8944, "step": 393400 }, { "epoch": 23.01573375445985, "grad_norm": 0.23571544885635376, "learning_rate": 0.001, "loss": 1.8961, "step": 393500 }, { "epoch": 23.02158273381295, "grad_norm": 0.21549449861049652, "learning_rate": 0.001, "loss": 1.8926, "step": 393600 }, { "epoch": 23.027431713166052, "grad_norm": 0.20991183817386627, "learning_rate": 0.001, "loss": 1.8934, "step": 393700 }, { "epoch": 23.033280692519156, "grad_norm": 0.25056329369544983, "learning_rate": 0.001, "loss": 1.8899, "step": 393800 }, { "epoch": 23.03912967187226, "grad_norm": 0.22310560941696167, "learning_rate": 0.001, "loss": 1.899, "step": 393900 }, { "epoch": 23.04497865122536, "grad_norm": 0.2619352638721466, "learning_rate": 0.001, "loss": 1.9001, "step": 394000 }, { "epoch": 23.050827630578464, "grad_norm": 0.20539885759353638, "learning_rate": 0.001, "loss": 1.8984, "step": 394100 }, { "epoch": 23.056676609931568, "grad_norm": 0.1939355731010437, "learning_rate": 0.001, "loss": 1.8981, "step": 394200 }, { "epoch": 23.062525589284668, "grad_norm": 0.2497740238904953, "learning_rate": 0.001, "loss": 1.8937, "step": 394300 }, { "epoch": 23.068374568637772, "grad_norm": 0.20910319685935974, "learning_rate": 0.001, "loss": 1.899, "step": 394400 }, { "epoch": 23.074223547990876, "grad_norm": 0.19780324399471283, "learning_rate": 0.001, "loss": 1.8984, "step": 394500 }, { "epoch": 23.08007252734398, "grad_norm": 0.26027798652648926, "learning_rate": 0.001, "loss": 1.901, "step": 394600 }, { "epoch": 23.08592150669708, "grad_norm": 0.25074130296707153, "learning_rate": 0.001, "loss": 1.8947, "step": 394700 }, { "epoch": 23.091770486050184, "grad_norm": 0.19913330674171448, "learning_rate": 0.001, "loss": 1.899, "step": 394800 }, { "epoch": 23.097619465403287, "grad_norm": 0.2391635924577713, "learning_rate": 0.001, "loss": 1.8956, "step": 394900 }, { "epoch": 23.10346844475639, "grad_norm": 0.19949018955230713, "learning_rate": 0.001, "loss": 1.8926, "step": 395000 }, { "epoch": 23.10931742410949, "grad_norm": 0.2084311842918396, "learning_rate": 0.001, "loss": 1.889, "step": 395100 }, { "epoch": 23.115166403462595, "grad_norm": 0.2155344933271408, "learning_rate": 0.001, "loss": 1.898, "step": 395200 }, { "epoch": 23.1210153828157, "grad_norm": 0.2186867743730545, "learning_rate": 0.001, "loss": 1.8928, "step": 395300 }, { "epoch": 23.126864362168803, "grad_norm": 0.21587295830249786, "learning_rate": 0.001, "loss": 1.897, "step": 395400 }, { "epoch": 23.132713341521903, "grad_norm": 0.2764468789100647, "learning_rate": 0.001, "loss": 1.8974, "step": 395500 }, { "epoch": 23.138562320875007, "grad_norm": 0.19105400145053864, "learning_rate": 0.001, "loss": 1.9015, "step": 395600 }, { "epoch": 23.14441130022811, "grad_norm": 0.2750973403453827, "learning_rate": 0.001, "loss": 1.8999, "step": 395700 }, { "epoch": 23.150260279581214, "grad_norm": 0.19998647272586823, "learning_rate": 0.001, "loss": 1.9023, "step": 395800 }, { "epoch": 23.156109258934315, "grad_norm": 0.1971772164106369, "learning_rate": 0.001, "loss": 1.8959, "step": 395900 }, { "epoch": 23.16195823828742, "grad_norm": 0.17737215757369995, "learning_rate": 0.001, "loss": 1.8956, "step": 396000 }, { "epoch": 23.167807217640522, "grad_norm": 0.24681422114372253, "learning_rate": 0.001, "loss": 1.8965, "step": 396100 }, { "epoch": 23.173656196993626, "grad_norm": 0.21637555956840515, "learning_rate": 0.001, "loss": 1.8995, "step": 396200 }, { "epoch": 23.179505176346726, "grad_norm": 0.24056147038936615, "learning_rate": 0.001, "loss": 1.8978, "step": 396300 }, { "epoch": 23.18535415569983, "grad_norm": 0.17760038375854492, "learning_rate": 0.001, "loss": 1.8999, "step": 396400 }, { "epoch": 23.191203135052934, "grad_norm": 0.2898433804512024, "learning_rate": 0.001, "loss": 1.8982, "step": 396500 }, { "epoch": 23.197052114406038, "grad_norm": 0.24035851657390594, "learning_rate": 0.001, "loss": 1.905, "step": 396600 }, { "epoch": 23.202901093759138, "grad_norm": 0.20398515462875366, "learning_rate": 0.001, "loss": 1.8987, "step": 396700 }, { "epoch": 23.208750073112242, "grad_norm": 0.20178081095218658, "learning_rate": 0.001, "loss": 1.894, "step": 396800 }, { "epoch": 23.214599052465346, "grad_norm": 0.20035415887832642, "learning_rate": 0.001, "loss": 1.8967, "step": 396900 }, { "epoch": 23.22044803181845, "grad_norm": 0.19564373791217804, "learning_rate": 0.001, "loss": 1.906, "step": 397000 }, { "epoch": 23.22629701117155, "grad_norm": 0.26396578550338745, "learning_rate": 0.001, "loss": 1.9002, "step": 397100 }, { "epoch": 23.232145990524653, "grad_norm": 0.18955501914024353, "learning_rate": 0.001, "loss": 1.8896, "step": 397200 }, { "epoch": 23.237994969877757, "grad_norm": 0.24759498238563538, "learning_rate": 0.001, "loss": 1.893, "step": 397300 }, { "epoch": 23.243843949230858, "grad_norm": 0.20984406769275665, "learning_rate": 0.001, "loss": 1.8979, "step": 397400 }, { "epoch": 23.24969292858396, "grad_norm": 0.22963565587997437, "learning_rate": 0.001, "loss": 1.8986, "step": 397500 }, { "epoch": 23.255541907937065, "grad_norm": 0.20715981721878052, "learning_rate": 0.001, "loss": 1.8944, "step": 397600 }, { "epoch": 23.26139088729017, "grad_norm": 0.21671205759048462, "learning_rate": 0.001, "loss": 1.8979, "step": 397700 }, { "epoch": 23.26723986664327, "grad_norm": 0.19833967089653015, "learning_rate": 0.001, "loss": 1.8981, "step": 397800 }, { "epoch": 23.273088845996373, "grad_norm": 0.245306596159935, "learning_rate": 0.001, "loss": 1.9035, "step": 397900 }, { "epoch": 23.278937825349477, "grad_norm": 0.24659070372581482, "learning_rate": 0.001, "loss": 1.8996, "step": 398000 }, { "epoch": 23.28478680470258, "grad_norm": 0.2436733990907669, "learning_rate": 0.001, "loss": 1.9003, "step": 398100 }, { "epoch": 23.29063578405568, "grad_norm": 0.19674357771873474, "learning_rate": 0.001, "loss": 1.8963, "step": 398200 }, { "epoch": 23.296484763408785, "grad_norm": 0.1980125904083252, "learning_rate": 0.001, "loss": 1.8985, "step": 398300 }, { "epoch": 23.30233374276189, "grad_norm": 0.20121018588542938, "learning_rate": 0.001, "loss": 1.9045, "step": 398400 }, { "epoch": 23.308182722114992, "grad_norm": 0.2435746043920517, "learning_rate": 0.001, "loss": 1.8984, "step": 398500 }, { "epoch": 23.314031701468092, "grad_norm": 0.22335125505924225, "learning_rate": 0.001, "loss": 1.8977, "step": 398600 }, { "epoch": 23.319880680821196, "grad_norm": 0.2321440577507019, "learning_rate": 0.001, "loss": 1.8992, "step": 398700 }, { "epoch": 23.3257296601743, "grad_norm": 0.24104145169258118, "learning_rate": 0.001, "loss": 1.8987, "step": 398800 }, { "epoch": 23.331578639527404, "grad_norm": 0.23888644576072693, "learning_rate": 0.001, "loss": 1.9024, "step": 398900 }, { "epoch": 23.337427618880504, "grad_norm": 0.2630763351917267, "learning_rate": 0.001, "loss": 1.9062, "step": 399000 }, { "epoch": 23.343276598233608, "grad_norm": 0.21821962296962738, "learning_rate": 0.001, "loss": 1.9025, "step": 399100 }, { "epoch": 23.34912557758671, "grad_norm": 0.21924756467342377, "learning_rate": 0.001, "loss": 1.9053, "step": 399200 }, { "epoch": 23.354974556939815, "grad_norm": 0.19897079467773438, "learning_rate": 0.001, "loss": 1.9011, "step": 399300 }, { "epoch": 23.360823536292916, "grad_norm": 0.31570661067962646, "learning_rate": 0.001, "loss": 1.9018, "step": 399400 }, { "epoch": 23.36667251564602, "grad_norm": 0.2565517723560333, "learning_rate": 0.001, "loss": 1.9086, "step": 399500 }, { "epoch": 23.372521494999123, "grad_norm": 0.2247127741575241, "learning_rate": 0.001, "loss": 1.8935, "step": 399600 }, { "epoch": 23.378370474352227, "grad_norm": 0.21167594194412231, "learning_rate": 0.001, "loss": 1.8988, "step": 399700 }, { "epoch": 23.384219453705327, "grad_norm": 0.20979133248329163, "learning_rate": 0.001, "loss": 1.8987, "step": 399800 }, { "epoch": 23.39006843305843, "grad_norm": 0.19357717037200928, "learning_rate": 0.001, "loss": 1.9016, "step": 399900 }, { "epoch": 23.395917412411535, "grad_norm": 0.2570589482784271, "learning_rate": 0.001, "loss": 1.8967, "step": 400000 }, { "epoch": 23.395917412411535, "eval_ag_news_accuracy": 0.239703125, "eval_ag_news_bleu_score": 7.032513563104892, "eval_ag_news_bleu_score_sem": 0.4692212205999121, "eval_ag_news_emb_cos_sim": 0.7166081666946411, "eval_ag_news_emb_cos_sim_sem": 0.015508761629462242, "eval_ag_news_emb_top1_equal": 0.921875, "eval_ag_news_emb_top1_equal_sem": 0.023813825100660324, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.712697744369507, "eval_ag_news_n_ngrams_match_1": 13.9296875, "eval_ag_news_n_ngrams_match_2": 4.1953125, "eval_ag_news_n_ngrams_match_3": 1.609375, "eval_ag_news_num_pred_words": 47.2109375, "eval_ag_news_num_true_words": 46.2890625, "eval_ag_news_perplexity": 15.069875397702841, "eval_ag_news_pred_num_tokens": 73.1953125, "eval_ag_news_rouge_score": 0.28771770489152204, "eval_ag_news_runtime": 39.1703, "eval_ag_news_samples_per_second": 12.765, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.31968880657413623, "eval_ag_news_token_set_f1_sem": 0.009560321049515806, "eval_ag_news_token_set_precision": 0.2984178664985784, "eval_ag_news_token_set_recall": 0.3619041845139464, "eval_ag_news_true_num_tokens": 63.625, "step": 400000 }, { "epoch": 23.395917412411535, "eval_anthropic_toxic_prompts_accuracy": 0.101296875, "eval_anthropic_toxic_prompts_bleu_score": 41.853102593539646, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.7213546615534607, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8913455605506897, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009018261916935444, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02934655810211727, "eval_anthropic_toxic_prompts_loss": 1.2600555419921875, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.2734375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.40625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.3984375, "eval_anthropic_toxic_prompts_num_pred_words": 15.984375, "eval_anthropic_toxic_prompts_num_true_words": 15.4296875, "eval_anthropic_toxic_prompts_perplexity": 3.5256173017360006, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.875, "eval_anthropic_toxic_prompts_rouge_score": 0.6751018339450919, "eval_anthropic_toxic_prompts_runtime": 30.1701, "eval_anthropic_toxic_prompts_samples_per_second": 16.573, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.6939244422778872, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01913580961424049, "eval_anthropic_toxic_prompts_token_set_precision": 0.7021101715430466, "eval_anthropic_toxic_prompts_token_set_recall": 0.6943800730285855, "eval_anthropic_toxic_prompts_true_num_tokens": 19.0234375, "step": 400000 }, { "epoch": 23.395917412411535, "eval_arxiv_accuracy": 0.37521875, "eval_arxiv_bleu_score": 1.4703954125999825, "eval_arxiv_bleu_score_sem": 0.13515289477607303, "eval_arxiv_emb_cos_sim": 0.4705086350440979, "eval_arxiv_emb_cos_sim_sem": 0.01958305761218071, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.443722724914551, "eval_arxiv_n_ngrams_match_1": 11.8671875, "eval_arxiv_n_ngrams_match_2": 1.9921875, "eval_arxiv_n_ngrams_match_3": 0.3359375, "eval_arxiv_num_pred_words": 52.4375, "eval_arxiv_num_true_words": 86.0859375, "eval_arxiv_perplexity": 31.30327500795702, "eval_arxiv_pred_num_tokens": 124.9921875, "eval_arxiv_rouge_score": 0.1611792435369398, "eval_arxiv_runtime": 30.8503, "eval_arxiv_samples_per_second": 16.207, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.15631468070119836, "eval_arxiv_token_set_f1_sem": 0.00818522828644056, "eval_arxiv_token_set_precision": 0.10529301219820945, "eval_arxiv_token_set_recall": 0.41082127452707806, "eval_arxiv_true_num_tokens": 124.890625, "step": 400000 }, { "epoch": 23.395917412411535, "eval_python_code_alpaca_accuracy": 0.12815625, "eval_python_code_alpaca_bleu_score": 27.634041178607223, "eval_python_code_alpaca_bleu_score_sem": 1.573690044959428, "eval_python_code_alpaca_emb_cos_sim": 0.8728632926940918, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008853812702000141, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5493710041046143, "eval_python_code_alpaca_n_ngrams_match_1": 10.25, "eval_python_code_alpaca_n_ngrams_match_2": 5.6171875, "eval_python_code_alpaca_n_ngrams_match_3": 3.1875, "eval_python_code_alpaca_num_pred_words": 16.59375, "eval_python_code_alpaca_num_true_words": 17.6015625, "eval_python_code_alpaca_perplexity": 4.708507619002503, "eval_python_code_alpaca_pred_num_tokens": 22.515625, "eval_python_code_alpaca_rouge_score": 0.6095648624914975, "eval_python_code_alpaca_runtime": 30.4455, "eval_python_code_alpaca_samples_per_second": 16.423, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6217675665666385, "eval_python_code_alpaca_token_set_f1_sem": 0.013102520684605498, "eval_python_code_alpaca_token_set_precision": 0.6083623844349693, "eval_python_code_alpaca_token_set_recall": 0.6433300712310627, "eval_python_code_alpaca_true_num_tokens": 23.078125, "step": 400000 }, { "epoch": 23.395917412411535, "eval_wikibio_accuracy": 0.36503125, "eval_wikibio_bleu_score": 7.4229659465206055, "eval_wikibio_bleu_score_sem": 0.7157088599926079, "eval_wikibio_emb_cos_sim": 0.6050471663475037, "eval_wikibio_emb_cos_sim_sem": 0.021548422053456306, "eval_wikibio_emb_top1_equal": 0.953125, "eval_wikibio_emb_top1_equal_sem": 0.01875615119934082, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7235798835754395, "eval_wikibio_n_ngrams_match_1": 15.71875, "eval_wikibio_n_ngrams_match_2": 5.59375, "eval_wikibio_n_ngrams_match_3": 2.3671875, "eval_wikibio_num_pred_words": 53.6015625, "eval_wikibio_num_true_words": 54.1953125, "eval_wikibio_perplexity": 15.234763419620798, "eval_wikibio_pred_num_tokens": 107.7109375, "eval_wikibio_rouge_score": 0.2977352444467666, "eval_wikibio_runtime": 31.5154, "eval_wikibio_samples_per_second": 15.865, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32154376226733383, "eval_wikibio_token_set_f1_sem": 0.012349040992869998, "eval_wikibio_token_set_precision": 0.2815128259710156, "eval_wikibio_token_set_recall": 0.4255548350827625, "eval_wikibio_true_num_tokens": 103.0625, "step": 400000 }, { "epoch": 23.395917412411535, "eval_msmarco_accuracy": 0.389046875, "eval_msmarco_bleu_score": 15.192690522350183, "eval_msmarco_bleu_score_sem": 1.262007352582014, "eval_msmarco_emb_cos_sim": 0.7627973556518555, "eval_msmarco_emb_cos_sim_sem": 0.01688574068248272, "eval_msmarco_emb_top1_equal": 0.9140625, "eval_msmarco_emb_top1_equal_sem": 0.024870097637176514, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.8380281925201416, "eval_msmarco_n_ngrams_match_1": 28.3125, "eval_msmarco_n_ngrams_match_2": 12.0078125, "eval_msmarco_n_ngrams_match_3": 6.625, "eval_msmarco_num_pred_words": 65.234375, "eval_msmarco_num_true_words": 65.2265625, "eval_msmarco_perplexity": 6.28413493229324, "eval_msmarco_pred_num_tokens": 90.0, "eval_msmarco_rouge_score": 0.41649683051483966, "eval_msmarco_runtime": 27.6665, "eval_msmarco_samples_per_second": 18.072, "eval_msmarco_steps_per_second": 0.036, "eval_msmarco_token_set_f1": 0.44047610213342436, "eval_msmarco_token_set_f1_sem": 0.01395716322354114, "eval_msmarco_token_set_precision": 0.4034669700742495, "eval_msmarco_token_set_recall": 0.5032541082653947, "eval_msmarco_true_num_tokens": 83.65625, "step": 400000 }, { "epoch": 23.40176639176464, "grad_norm": 0.2155192643404007, "learning_rate": 0.001, "loss": 1.8966, "step": 400100 }, { "epoch": 23.40761537111774, "grad_norm": 0.213144451379776, "learning_rate": 0.001, "loss": 1.8983, "step": 400200 }, { "epoch": 23.413464350470843, "grad_norm": 0.2059832662343979, "learning_rate": 0.001, "loss": 1.9001, "step": 400300 }, { "epoch": 23.419313329823947, "grad_norm": 0.2005961388349533, "learning_rate": 0.001, "loss": 1.8992, "step": 400400 }, { "epoch": 23.425162309177047, "grad_norm": 0.18574480712413788, "learning_rate": 0.001, "loss": 1.9088, "step": 400500 }, { "epoch": 23.43101128853015, "grad_norm": 0.18923912942409515, "learning_rate": 0.001, "loss": 1.903, "step": 400600 }, { "epoch": 23.436860267883254, "grad_norm": 0.22256793081760406, "learning_rate": 0.001, "loss": 1.9022, "step": 400700 }, { "epoch": 23.44270924723636, "grad_norm": 0.20506998896598816, "learning_rate": 0.001, "loss": 1.8981, "step": 400800 }, { "epoch": 23.44855822658946, "grad_norm": 0.2681736648082733, "learning_rate": 0.001, "loss": 1.9042, "step": 400900 }, { "epoch": 23.454407205942562, "grad_norm": 0.28376513719558716, "learning_rate": 0.001, "loss": 1.9024, "step": 401000 }, { "epoch": 23.460256185295666, "grad_norm": 0.1835314780473709, "learning_rate": 0.001, "loss": 1.9014, "step": 401100 }, { "epoch": 23.46610516464877, "grad_norm": 0.1928459256887436, "learning_rate": 0.001, "loss": 1.9019, "step": 401200 }, { "epoch": 23.47195414400187, "grad_norm": 0.18418382108211517, "learning_rate": 0.001, "loss": 1.8961, "step": 401300 }, { "epoch": 23.477803123354974, "grad_norm": 0.24742212891578674, "learning_rate": 0.001, "loss": 1.8959, "step": 401400 }, { "epoch": 23.483652102708078, "grad_norm": 0.20503056049346924, "learning_rate": 0.001, "loss": 1.9051, "step": 401500 }, { "epoch": 23.48950108206118, "grad_norm": 0.18842190504074097, "learning_rate": 0.001, "loss": 1.9052, "step": 401600 }, { "epoch": 23.495350061414282, "grad_norm": 0.19931286573410034, "learning_rate": 0.001, "loss": 1.8971, "step": 401700 }, { "epoch": 23.501199040767386, "grad_norm": 0.23986418545246124, "learning_rate": 0.001, "loss": 1.9039, "step": 401800 }, { "epoch": 23.50704802012049, "grad_norm": 0.18850916624069214, "learning_rate": 0.001, "loss": 1.9026, "step": 401900 }, { "epoch": 23.512896999473593, "grad_norm": 0.1988588571548462, "learning_rate": 0.001, "loss": 1.8987, "step": 402000 }, { "epoch": 23.518745978826693, "grad_norm": 0.2475220263004303, "learning_rate": 0.001, "loss": 1.9045, "step": 402100 }, { "epoch": 23.524594958179797, "grad_norm": 0.1843584030866623, "learning_rate": 0.001, "loss": 1.9054, "step": 402200 }, { "epoch": 23.5304439375329, "grad_norm": 0.24012577533721924, "learning_rate": 0.001, "loss": 1.8981, "step": 402300 }, { "epoch": 23.536292916886005, "grad_norm": 0.2085958868265152, "learning_rate": 0.001, "loss": 1.9021, "step": 402400 }, { "epoch": 23.542141896239105, "grad_norm": 0.2660018503665924, "learning_rate": 0.001, "loss": 1.9014, "step": 402500 }, { "epoch": 23.54799087559221, "grad_norm": 0.23781470954418182, "learning_rate": 0.001, "loss": 1.8994, "step": 402600 }, { "epoch": 23.553839854945313, "grad_norm": 0.19284620881080627, "learning_rate": 0.001, "loss": 1.9006, "step": 402700 }, { "epoch": 23.559688834298417, "grad_norm": 0.2508600652217865, "learning_rate": 0.001, "loss": 1.9048, "step": 402800 }, { "epoch": 23.565537813651517, "grad_norm": 0.21328917145729065, "learning_rate": 0.001, "loss": 1.9008, "step": 402900 }, { "epoch": 23.57138679300462, "grad_norm": 0.1804395616054535, "learning_rate": 0.001, "loss": 1.9003, "step": 403000 }, { "epoch": 23.577235772357724, "grad_norm": 0.22304745018482208, "learning_rate": 0.001, "loss": 1.9049, "step": 403100 }, { "epoch": 23.583084751710828, "grad_norm": 0.21988126635551453, "learning_rate": 0.001, "loss": 1.9048, "step": 403200 }, { "epoch": 23.58893373106393, "grad_norm": 0.22336123883724213, "learning_rate": 0.001, "loss": 1.9044, "step": 403300 }, { "epoch": 23.594782710417032, "grad_norm": 0.18051481246948242, "learning_rate": 0.001, "loss": 1.9025, "step": 403400 }, { "epoch": 23.600631689770136, "grad_norm": 0.2120450884103775, "learning_rate": 0.001, "loss": 1.9031, "step": 403500 }, { "epoch": 23.606480669123236, "grad_norm": 0.21929648518562317, "learning_rate": 0.001, "loss": 1.9003, "step": 403600 }, { "epoch": 23.61232964847634, "grad_norm": 0.1877591460943222, "learning_rate": 0.001, "loss": 1.9014, "step": 403700 }, { "epoch": 23.618178627829444, "grad_norm": 0.16091662645339966, "learning_rate": 0.001, "loss": 1.9002, "step": 403800 }, { "epoch": 23.624027607182548, "grad_norm": 0.20146191120147705, "learning_rate": 0.001, "loss": 1.8994, "step": 403900 }, { "epoch": 23.629876586535648, "grad_norm": 0.23281629383563995, "learning_rate": 0.001, "loss": 1.902, "step": 404000 }, { "epoch": 23.63572556588875, "grad_norm": 0.23718826472759247, "learning_rate": 0.001, "loss": 1.9075, "step": 404100 }, { "epoch": 23.641574545241856, "grad_norm": 0.17336405813694, "learning_rate": 0.001, "loss": 1.9035, "step": 404200 }, { "epoch": 23.64742352459496, "grad_norm": 0.19006724655628204, "learning_rate": 0.001, "loss": 1.897, "step": 404300 }, { "epoch": 23.65327250394806, "grad_norm": 0.23735710978507996, "learning_rate": 0.001, "loss": 1.9015, "step": 404400 }, { "epoch": 23.659121483301163, "grad_norm": 0.26744768023490906, "learning_rate": 0.001, "loss": 1.9098, "step": 404500 }, { "epoch": 23.664970462654267, "grad_norm": 0.18619570136070251, "learning_rate": 0.001, "loss": 1.8985, "step": 404600 }, { "epoch": 23.67081944200737, "grad_norm": 0.1717088520526886, "learning_rate": 0.001, "loss": 1.9044, "step": 404700 }, { "epoch": 23.67666842136047, "grad_norm": 0.22604568302631378, "learning_rate": 0.001, "loss": 1.9046, "step": 404800 }, { "epoch": 23.682517400713575, "grad_norm": 0.20733994245529175, "learning_rate": 0.001, "loss": 1.9024, "step": 404900 }, { "epoch": 23.68836638006668, "grad_norm": 0.21428023278713226, "learning_rate": 0.001, "loss": 1.9001, "step": 405000 }, { "epoch": 23.694215359419783, "grad_norm": 0.25771984457969666, "learning_rate": 0.001, "loss": 1.8988, "step": 405100 }, { "epoch": 23.700064338772883, "grad_norm": 0.3154093027114868, "learning_rate": 0.001, "loss": 1.9015, "step": 405200 }, { "epoch": 23.705913318125987, "grad_norm": 0.18939952552318573, "learning_rate": 0.001, "loss": 1.902, "step": 405300 }, { "epoch": 23.71176229747909, "grad_norm": 0.18232789635658264, "learning_rate": 0.001, "loss": 1.9042, "step": 405400 }, { "epoch": 23.717611276832194, "grad_norm": 0.1924862414598465, "learning_rate": 0.001, "loss": 1.9086, "step": 405500 }, { "epoch": 23.723460256185295, "grad_norm": 0.16743114590644836, "learning_rate": 0.001, "loss": 1.9041, "step": 405600 }, { "epoch": 23.7293092355384, "grad_norm": 0.21700453758239746, "learning_rate": 0.001, "loss": 1.9006, "step": 405700 }, { "epoch": 23.735158214891502, "grad_norm": 0.26601356267929077, "learning_rate": 0.001, "loss": 1.9106, "step": 405800 }, { "epoch": 23.741007194244606, "grad_norm": 0.18745258450508118, "learning_rate": 0.001, "loss": 1.9021, "step": 405900 }, { "epoch": 23.746856173597706, "grad_norm": 0.200617253780365, "learning_rate": 0.001, "loss": 1.9056, "step": 406000 }, { "epoch": 23.75270515295081, "grad_norm": 0.18553468585014343, "learning_rate": 0.001, "loss": 1.8999, "step": 406100 }, { "epoch": 23.758554132303914, "grad_norm": 0.2508409023284912, "learning_rate": 0.001, "loss": 1.9051, "step": 406200 }, { "epoch": 23.764403111657018, "grad_norm": 0.18505387008190155, "learning_rate": 0.001, "loss": 1.9084, "step": 406300 }, { "epoch": 23.770252091010118, "grad_norm": 0.19688232243061066, "learning_rate": 0.001, "loss": 1.9032, "step": 406400 }, { "epoch": 23.77610107036322, "grad_norm": 0.21662351489067078, "learning_rate": 0.001, "loss": 1.9106, "step": 406500 }, { "epoch": 23.781950049716325, "grad_norm": 0.22378075122833252, "learning_rate": 0.001, "loss": 1.9068, "step": 406600 }, { "epoch": 23.787799029069426, "grad_norm": 0.21002084016799927, "learning_rate": 0.001, "loss": 1.8993, "step": 406700 }, { "epoch": 23.79364800842253, "grad_norm": 0.18411394953727722, "learning_rate": 0.001, "loss": 1.9022, "step": 406800 }, { "epoch": 23.799496987775633, "grad_norm": 0.2385079562664032, "learning_rate": 0.001, "loss": 1.8972, "step": 406900 }, { "epoch": 23.805345967128737, "grad_norm": 0.27263951301574707, "learning_rate": 0.001, "loss": 1.9008, "step": 407000 }, { "epoch": 23.811194946481837, "grad_norm": 0.18925315141677856, "learning_rate": 0.001, "loss": 1.8984, "step": 407100 }, { "epoch": 23.81704392583494, "grad_norm": 0.2156182825565338, "learning_rate": 0.001, "loss": 1.9037, "step": 407200 }, { "epoch": 23.822892905188045, "grad_norm": 0.1821366399526596, "learning_rate": 0.001, "loss": 1.9044, "step": 407300 }, { "epoch": 23.82874188454115, "grad_norm": 0.23712478578090668, "learning_rate": 0.001, "loss": 1.9055, "step": 407400 }, { "epoch": 23.83459086389425, "grad_norm": 0.22668196260929108, "learning_rate": 0.001, "loss": 1.9056, "step": 407500 }, { "epoch": 23.840439843247353, "grad_norm": 0.16702060401439667, "learning_rate": 0.001, "loss": 1.9045, "step": 407600 }, { "epoch": 23.846288822600457, "grad_norm": 0.20742842555046082, "learning_rate": 0.001, "loss": 1.9031, "step": 407700 }, { "epoch": 23.85213780195356, "grad_norm": 0.2082674503326416, "learning_rate": 0.001, "loss": 1.9025, "step": 407800 }, { "epoch": 23.85798678130666, "grad_norm": 0.22514720261096954, "learning_rate": 0.001, "loss": 1.9054, "step": 407900 }, { "epoch": 23.863835760659764, "grad_norm": 0.2121286243200302, "learning_rate": 0.001, "loss": 1.9082, "step": 408000 }, { "epoch": 23.869684740012868, "grad_norm": 0.18738579750061035, "learning_rate": 0.001, "loss": 1.9027, "step": 408100 }, { "epoch": 23.875533719365972, "grad_norm": 0.22428447008132935, "learning_rate": 0.001, "loss": 1.9013, "step": 408200 }, { "epoch": 23.881382698719072, "grad_norm": 0.2545178234577179, "learning_rate": 0.001, "loss": 1.9072, "step": 408300 }, { "epoch": 23.887231678072176, "grad_norm": 0.2254296988248825, "learning_rate": 0.001, "loss": 1.9047, "step": 408400 }, { "epoch": 23.89308065742528, "grad_norm": 0.2299213707447052, "learning_rate": 0.001, "loss": 1.8992, "step": 408500 }, { "epoch": 23.898929636778384, "grad_norm": 0.22911620140075684, "learning_rate": 0.001, "loss": 1.9016, "step": 408600 }, { "epoch": 23.904778616131484, "grad_norm": 0.24832342565059662, "learning_rate": 0.001, "loss": 1.9042, "step": 408700 }, { "epoch": 23.910627595484588, "grad_norm": 0.20802022516727448, "learning_rate": 0.001, "loss": 1.9051, "step": 408800 }, { "epoch": 23.91647657483769, "grad_norm": 0.2508595585823059, "learning_rate": 0.001, "loss": 1.9002, "step": 408900 }, { "epoch": 23.922325554190795, "grad_norm": 0.23969893157482147, "learning_rate": 0.001, "loss": 1.9029, "step": 409000 }, { "epoch": 23.928174533543896, "grad_norm": 0.23272080719470978, "learning_rate": 0.001, "loss": 1.9021, "step": 409100 }, { "epoch": 23.934023512897, "grad_norm": 0.21547311544418335, "learning_rate": 0.001, "loss": 1.8994, "step": 409200 }, { "epoch": 23.939872492250103, "grad_norm": 0.193403959274292, "learning_rate": 0.001, "loss": 1.9031, "step": 409300 }, { "epoch": 23.945721471603207, "grad_norm": 0.2730841636657715, "learning_rate": 0.001, "loss": 1.9027, "step": 409400 }, { "epoch": 23.951570450956307, "grad_norm": 0.19005228579044342, "learning_rate": 0.001, "loss": 1.9056, "step": 409500 }, { "epoch": 23.95741943030941, "grad_norm": 0.25881069898605347, "learning_rate": 0.001, "loss": 1.9059, "step": 409600 }, { "epoch": 23.963268409662515, "grad_norm": 0.17168335616588593, "learning_rate": 0.001, "loss": 1.9018, "step": 409700 }, { "epoch": 23.969117389015615, "grad_norm": 0.24526311457157135, "learning_rate": 0.001, "loss": 1.9006, "step": 409800 }, { "epoch": 23.97496636836872, "grad_norm": 0.20231425762176514, "learning_rate": 0.001, "loss": 1.9066, "step": 409900 }, { "epoch": 23.980815347721823, "grad_norm": 0.23532554507255554, "learning_rate": 0.001, "loss": 1.9044, "step": 410000 }, { "epoch": 23.986664327074926, "grad_norm": 0.2259921133518219, "learning_rate": 0.001, "loss": 1.9031, "step": 410100 }, { "epoch": 23.992513306428027, "grad_norm": 0.17737171053886414, "learning_rate": 0.001, "loss": 1.9009, "step": 410200 }, { "epoch": 23.99836228578113, "grad_norm": 0.18650132417678833, "learning_rate": 0.001, "loss": 1.9023, "step": 410300 }, { "epoch": 24.004211265134234, "grad_norm": 0.26972365379333496, "learning_rate": 0.001, "loss": 1.8926, "step": 410400 }, { "epoch": 24.010060244487338, "grad_norm": 0.2758162021636963, "learning_rate": 0.001, "loss": 1.8872, "step": 410500 }, { "epoch": 24.01590922384044, "grad_norm": 0.21239173412322998, "learning_rate": 0.001, "loss": 1.8901, "step": 410600 }, { "epoch": 24.021758203193542, "grad_norm": 0.20194876194000244, "learning_rate": 0.001, "loss": 1.8895, "step": 410700 }, { "epoch": 24.027607182546646, "grad_norm": 0.23010669648647308, "learning_rate": 0.001, "loss": 1.8886, "step": 410800 }, { "epoch": 24.03345616189975, "grad_norm": 0.1991807371377945, "learning_rate": 0.001, "loss": 1.889, "step": 410900 }, { "epoch": 24.03930514125285, "grad_norm": 0.2640625834465027, "learning_rate": 0.001, "loss": 1.8944, "step": 411000 }, { "epoch": 24.045154120605954, "grad_norm": 0.2103489339351654, "learning_rate": 0.001, "loss": 1.8913, "step": 411100 }, { "epoch": 24.051003099959058, "grad_norm": 0.2320561707019806, "learning_rate": 0.001, "loss": 1.8897, "step": 411200 }, { "epoch": 24.05685207931216, "grad_norm": 0.2785986661911011, "learning_rate": 0.001, "loss": 1.8905, "step": 411300 }, { "epoch": 24.06270105866526, "grad_norm": 0.20653510093688965, "learning_rate": 0.001, "loss": 1.8928, "step": 411400 }, { "epoch": 24.068550038018365, "grad_norm": 0.20237761735916138, "learning_rate": 0.001, "loss": 1.8824, "step": 411500 }, { "epoch": 24.07439901737147, "grad_norm": 0.2118833363056183, "learning_rate": 0.001, "loss": 1.8884, "step": 411600 }, { "epoch": 24.080247996724573, "grad_norm": 0.18795351684093475, "learning_rate": 0.001, "loss": 1.8866, "step": 411700 }, { "epoch": 24.086096976077673, "grad_norm": 0.23791059851646423, "learning_rate": 0.001, "loss": 1.89, "step": 411800 }, { "epoch": 24.091945955430777, "grad_norm": 0.1867871731519699, "learning_rate": 0.001, "loss": 1.8947, "step": 411900 }, { "epoch": 24.09779493478388, "grad_norm": 0.2574344575405121, "learning_rate": 0.001, "loss": 1.8903, "step": 412000 }, { "epoch": 24.103643914136985, "grad_norm": 0.25718456506729126, "learning_rate": 0.001, "loss": 1.8934, "step": 412100 }, { "epoch": 24.109492893490085, "grad_norm": 0.21035535633563995, "learning_rate": 0.001, "loss": 1.8839, "step": 412200 }, { "epoch": 24.11534187284319, "grad_norm": 0.2716791033744812, "learning_rate": 0.001, "loss": 1.8869, "step": 412300 }, { "epoch": 24.121190852196293, "grad_norm": 0.24003806710243225, "learning_rate": 0.001, "loss": 1.895, "step": 412400 }, { "epoch": 24.127039831549396, "grad_norm": 0.25477713346481323, "learning_rate": 0.001, "loss": 1.8914, "step": 412500 }, { "epoch": 24.132888810902497, "grad_norm": 0.2194405347108841, "learning_rate": 0.001, "loss": 1.8932, "step": 412600 }, { "epoch": 24.1387377902556, "grad_norm": 0.18644361197948456, "learning_rate": 0.001, "loss": 1.8915, "step": 412700 }, { "epoch": 24.144586769608704, "grad_norm": 0.2577097415924072, "learning_rate": 0.001, "loss": 1.8873, "step": 412800 }, { "epoch": 24.150435748961804, "grad_norm": 0.18777592480182648, "learning_rate": 0.001, "loss": 1.8906, "step": 412900 }, { "epoch": 24.15628472831491, "grad_norm": 0.31828758120536804, "learning_rate": 0.001, "loss": 1.899, "step": 413000 }, { "epoch": 24.162133707668012, "grad_norm": 0.22349265217781067, "learning_rate": 0.001, "loss": 1.8908, "step": 413100 }, { "epoch": 24.167982687021116, "grad_norm": 0.20296674966812134, "learning_rate": 0.001, "loss": 1.893, "step": 413200 }, { "epoch": 24.173831666374216, "grad_norm": 0.21213190257549286, "learning_rate": 0.001, "loss": 1.8891, "step": 413300 }, { "epoch": 24.17968064572732, "grad_norm": 0.24335753917694092, "learning_rate": 0.001, "loss": 1.8854, "step": 413400 }, { "epoch": 24.185529625080424, "grad_norm": 0.18110397458076477, "learning_rate": 0.001, "loss": 1.8924, "step": 413500 }, { "epoch": 24.191378604433527, "grad_norm": 0.2305908054113388, "learning_rate": 0.001, "loss": 1.8955, "step": 413600 }, { "epoch": 24.197227583786628, "grad_norm": 0.2573641836643219, "learning_rate": 0.001, "loss": 1.8936, "step": 413700 }, { "epoch": 24.20307656313973, "grad_norm": 0.2934132516384125, "learning_rate": 0.001, "loss": 1.8937, "step": 413800 }, { "epoch": 24.208925542492835, "grad_norm": 0.21363455057144165, "learning_rate": 0.001, "loss": 1.8982, "step": 413900 }, { "epoch": 24.21477452184594, "grad_norm": 0.19383348524570465, "learning_rate": 0.001, "loss": 1.8919, "step": 414000 }, { "epoch": 24.22062350119904, "grad_norm": 0.19072090089321136, "learning_rate": 0.001, "loss": 1.8966, "step": 414100 }, { "epoch": 24.226472480552143, "grad_norm": 0.2257453054189682, "learning_rate": 0.001, "loss": 1.8908, "step": 414200 }, { "epoch": 24.232321459905247, "grad_norm": 0.23823049664497375, "learning_rate": 0.001, "loss": 1.8924, "step": 414300 }, { "epoch": 24.23817043925835, "grad_norm": 0.21592973172664642, "learning_rate": 0.001, "loss": 1.8947, "step": 414400 }, { "epoch": 24.24401941861145, "grad_norm": 0.23154719173908234, "learning_rate": 0.001, "loss": 1.8949, "step": 414500 }, { "epoch": 24.249868397964555, "grad_norm": 0.20751430094242096, "learning_rate": 0.001, "loss": 1.8953, "step": 414600 }, { "epoch": 24.25571737731766, "grad_norm": 0.20069177448749542, "learning_rate": 0.001, "loss": 1.8899, "step": 414700 }, { "epoch": 24.261566356670762, "grad_norm": 0.28975602984428406, "learning_rate": 0.001, "loss": 1.89, "step": 414800 }, { "epoch": 24.267415336023863, "grad_norm": 0.2737014591693878, "learning_rate": 0.001, "loss": 1.892, "step": 414900 }, { "epoch": 24.273264315376966, "grad_norm": 0.2128576785326004, "learning_rate": 0.001, "loss": 1.8892, "step": 415000 }, { "epoch": 24.27911329473007, "grad_norm": 0.24244040250778198, "learning_rate": 0.001, "loss": 1.8924, "step": 415100 }, { "epoch": 24.284962274083174, "grad_norm": 0.17385797202587128, "learning_rate": 0.001, "loss": 1.8995, "step": 415200 }, { "epoch": 24.290811253436274, "grad_norm": 0.22673438489437103, "learning_rate": 0.001, "loss": 1.8908, "step": 415300 }, { "epoch": 24.296660232789378, "grad_norm": 0.22172342240810394, "learning_rate": 0.001, "loss": 1.8987, "step": 415400 }, { "epoch": 24.302509212142482, "grad_norm": 0.23876072466373444, "learning_rate": 0.001, "loss": 1.8968, "step": 415500 }, { "epoch": 24.308358191495586, "grad_norm": 0.22238725423812866, "learning_rate": 0.001, "loss": 1.8951, "step": 415600 }, { "epoch": 24.314207170848686, "grad_norm": 0.1996755599975586, "learning_rate": 0.001, "loss": 1.8986, "step": 415700 }, { "epoch": 24.32005615020179, "grad_norm": 0.20795217156410217, "learning_rate": 0.001, "loss": 1.8934, "step": 415800 }, { "epoch": 24.325905129554894, "grad_norm": 0.24831439554691315, "learning_rate": 0.001, "loss": 1.8941, "step": 415900 }, { "epoch": 24.331754108907994, "grad_norm": 0.21572937071323395, "learning_rate": 0.001, "loss": 1.8945, "step": 416000 }, { "epoch": 24.337603088261098, "grad_norm": 0.24276337027549744, "learning_rate": 0.001, "loss": 1.8976, "step": 416100 }, { "epoch": 24.3434520676142, "grad_norm": 0.1984441578388214, "learning_rate": 0.001, "loss": 1.8955, "step": 416200 }, { "epoch": 24.349301046967305, "grad_norm": 0.1999739110469818, "learning_rate": 0.001, "loss": 1.8893, "step": 416300 }, { "epoch": 24.355150026320405, "grad_norm": 0.21381621062755585, "learning_rate": 0.001, "loss": 1.8973, "step": 416400 }, { "epoch": 24.36099900567351, "grad_norm": 0.3461935222148895, "learning_rate": 0.001, "loss": 1.8962, "step": 416500 }, { "epoch": 24.366847985026613, "grad_norm": 0.26652729511260986, "learning_rate": 0.001, "loss": 1.8994, "step": 416600 }, { "epoch": 24.372696964379717, "grad_norm": 0.2685411870479584, "learning_rate": 0.001, "loss": 1.8961, "step": 416700 }, { "epoch": 24.378545943732817, "grad_norm": 0.2132881134748459, "learning_rate": 0.001, "loss": 1.8921, "step": 416800 }, { "epoch": 24.38439492308592, "grad_norm": 0.19383510947227478, "learning_rate": 0.001, "loss": 1.9027, "step": 416900 }, { "epoch": 24.390243902439025, "grad_norm": 0.3166053295135498, "learning_rate": 0.001, "loss": 1.8944, "step": 417000 }, { "epoch": 24.39609288179213, "grad_norm": 0.24340477585792542, "learning_rate": 0.001, "loss": 1.8968, "step": 417100 }, { "epoch": 24.40194186114523, "grad_norm": 0.19761884212493896, "learning_rate": 0.001, "loss": 1.8984, "step": 417200 }, { "epoch": 24.407790840498333, "grad_norm": 0.2518312633037567, "learning_rate": 0.001, "loss": 1.9001, "step": 417300 }, { "epoch": 24.413639819851436, "grad_norm": 0.1969878077507019, "learning_rate": 0.001, "loss": 1.8968, "step": 417400 }, { "epoch": 24.41948879920454, "grad_norm": 0.20227746665477753, "learning_rate": 0.001, "loss": 1.8948, "step": 417500 }, { "epoch": 24.42533777855764, "grad_norm": 0.2867928147315979, "learning_rate": 0.001, "loss": 1.8949, "step": 417600 }, { "epoch": 24.431186757910744, "grad_norm": 0.18276630342006683, "learning_rate": 0.001, "loss": 1.8954, "step": 417700 }, { "epoch": 24.437035737263848, "grad_norm": 0.23233886063098907, "learning_rate": 0.001, "loss": 1.8941, "step": 417800 }, { "epoch": 24.442884716616952, "grad_norm": 0.24963293969631195, "learning_rate": 0.001, "loss": 1.8952, "step": 417900 }, { "epoch": 24.448733695970052, "grad_norm": 0.1786150485277176, "learning_rate": 0.001, "loss": 1.8919, "step": 418000 }, { "epoch": 24.454582675323156, "grad_norm": 0.32827287912368774, "learning_rate": 0.001, "loss": 1.8951, "step": 418100 }, { "epoch": 24.46043165467626, "grad_norm": 0.23454314470291138, "learning_rate": 0.001, "loss": 1.8923, "step": 418200 }, { "epoch": 24.466280634029363, "grad_norm": 0.21007879078388214, "learning_rate": 0.001, "loss": 1.8943, "step": 418300 }, { "epoch": 24.472129613382464, "grad_norm": 0.24889536201953888, "learning_rate": 0.001, "loss": 1.8926, "step": 418400 }, { "epoch": 24.477978592735568, "grad_norm": 0.18878929316997528, "learning_rate": 0.001, "loss": 1.8989, "step": 418500 }, { "epoch": 24.48382757208867, "grad_norm": 0.19817408919334412, "learning_rate": 0.001, "loss": 1.8967, "step": 418600 }, { "epoch": 24.489676551441775, "grad_norm": 0.29379868507385254, "learning_rate": 0.001, "loss": 1.8935, "step": 418700 }, { "epoch": 24.495525530794875, "grad_norm": 0.21886436641216278, "learning_rate": 0.001, "loss": 1.8991, "step": 418800 }, { "epoch": 24.50137451014798, "grad_norm": 0.22582994401454926, "learning_rate": 0.001, "loss": 1.9013, "step": 418900 }, { "epoch": 24.507223489501083, "grad_norm": 0.2755979001522064, "learning_rate": 0.001, "loss": 1.8963, "step": 419000 }, { "epoch": 24.513072468854183, "grad_norm": 0.3320293426513672, "learning_rate": 0.001, "loss": 1.8953, "step": 419100 }, { "epoch": 24.518921448207287, "grad_norm": 0.2123277485370636, "learning_rate": 0.001, "loss": 1.8941, "step": 419200 }, { "epoch": 24.52477042756039, "grad_norm": 0.22214192152023315, "learning_rate": 0.001, "loss": 1.8939, "step": 419300 }, { "epoch": 24.530619406913495, "grad_norm": 0.2190861999988556, "learning_rate": 0.001, "loss": 1.8901, "step": 419400 }, { "epoch": 24.536468386266595, "grad_norm": 0.18015910685062408, "learning_rate": 0.001, "loss": 1.8942, "step": 419500 }, { "epoch": 24.5423173656197, "grad_norm": 0.2224213033914566, "learning_rate": 0.001, "loss": 1.8944, "step": 419600 }, { "epoch": 24.548166344972802, "grad_norm": 0.22241781651973724, "learning_rate": 0.001, "loss": 1.8914, "step": 419700 }, { "epoch": 24.554015324325906, "grad_norm": 0.25244635343551636, "learning_rate": 0.001, "loss": 1.892, "step": 419800 }, { "epoch": 24.559864303679007, "grad_norm": 0.24286051094532013, "learning_rate": 0.001, "loss": 1.8958, "step": 419900 }, { "epoch": 24.56571328303211, "grad_norm": 0.16787032783031464, "learning_rate": 0.001, "loss": 1.9006, "step": 420000 }, { "epoch": 24.571562262385214, "grad_norm": 0.2615216374397278, "learning_rate": 0.001, "loss": 1.892, "step": 420100 }, { "epoch": 24.577411241738318, "grad_norm": 0.19491593539714813, "learning_rate": 0.001, "loss": 1.8914, "step": 420200 }, { "epoch": 24.583260221091418, "grad_norm": 0.22721043229103088, "learning_rate": 0.001, "loss": 1.8958, "step": 420300 }, { "epoch": 24.589109200444522, "grad_norm": 0.2356017678976059, "learning_rate": 0.001, "loss": 1.895, "step": 420400 }, { "epoch": 24.594958179797626, "grad_norm": 0.2052108496427536, "learning_rate": 0.001, "loss": 1.8925, "step": 420500 }, { "epoch": 24.60080715915073, "grad_norm": 0.26090937852859497, "learning_rate": 0.001, "loss": 1.8983, "step": 420600 }, { "epoch": 24.60665613850383, "grad_norm": 0.1934508979320526, "learning_rate": 0.001, "loss": 1.9012, "step": 420700 }, { "epoch": 24.612505117856934, "grad_norm": 0.23032303154468536, "learning_rate": 0.001, "loss": 1.8964, "step": 420800 }, { "epoch": 24.618354097210037, "grad_norm": 0.22202017903327942, "learning_rate": 0.001, "loss": 1.8993, "step": 420900 }, { "epoch": 24.62420307656314, "grad_norm": 0.20868362486362457, "learning_rate": 0.001, "loss": 1.8903, "step": 421000 }, { "epoch": 24.63005205591624, "grad_norm": 0.22260138392448425, "learning_rate": 0.001, "loss": 1.9029, "step": 421100 }, { "epoch": 24.635901035269345, "grad_norm": 0.22743190824985504, "learning_rate": 0.001, "loss": 1.8977, "step": 421200 }, { "epoch": 24.64175001462245, "grad_norm": 0.21828758716583252, "learning_rate": 0.001, "loss": 1.8964, "step": 421300 }, { "epoch": 24.647598993975553, "grad_norm": 0.21900440752506256, "learning_rate": 0.001, "loss": 1.8977, "step": 421400 }, { "epoch": 24.653447973328653, "grad_norm": 0.18862201273441315, "learning_rate": 0.001, "loss": 1.8903, "step": 421500 }, { "epoch": 24.659296952681757, "grad_norm": 0.20794209837913513, "learning_rate": 0.001, "loss": 1.8933, "step": 421600 }, { "epoch": 24.66514593203486, "grad_norm": 0.3911062777042389, "learning_rate": 0.001, "loss": 1.8953, "step": 421700 }, { "epoch": 24.670994911387965, "grad_norm": 0.29312580823898315, "learning_rate": 0.001, "loss": 1.8993, "step": 421800 }, { "epoch": 24.676843890741065, "grad_norm": 0.1879030466079712, "learning_rate": 0.001, "loss": 1.8972, "step": 421900 }, { "epoch": 24.68269287009417, "grad_norm": 0.20531439781188965, "learning_rate": 0.001, "loss": 1.8924, "step": 422000 }, { "epoch": 24.688541849447272, "grad_norm": 0.21441830694675446, "learning_rate": 0.001, "loss": 1.8938, "step": 422100 }, { "epoch": 24.694390828800373, "grad_norm": 0.1957464963197708, "learning_rate": 0.001, "loss": 1.8946, "step": 422200 }, { "epoch": 24.700239808153476, "grad_norm": 0.19523583352565765, "learning_rate": 0.001, "loss": 1.8996, "step": 422300 }, { "epoch": 24.70608878750658, "grad_norm": 0.2789050340652466, "learning_rate": 0.001, "loss": 1.8978, "step": 422400 }, { "epoch": 24.711937766859684, "grad_norm": 0.3296523094177246, "learning_rate": 0.001, "loss": 1.9032, "step": 422500 }, { "epoch": 24.717786746212784, "grad_norm": 0.21078480780124664, "learning_rate": 0.001, "loss": 1.8901, "step": 422600 }, { "epoch": 24.723635725565888, "grad_norm": 0.23667657375335693, "learning_rate": 0.001, "loss": 1.8912, "step": 422700 }, { "epoch": 24.729484704918992, "grad_norm": 0.22749637067317963, "learning_rate": 0.001, "loss": 1.8986, "step": 422800 }, { "epoch": 24.735333684272096, "grad_norm": 0.20549991726875305, "learning_rate": 0.001, "loss": 1.8948, "step": 422900 }, { "epoch": 24.741182663625196, "grad_norm": 0.24276116490364075, "learning_rate": 0.001, "loss": 1.893, "step": 423000 }, { "epoch": 24.7470316429783, "grad_norm": 0.2225373089313507, "learning_rate": 0.001, "loss": 1.8937, "step": 423100 }, { "epoch": 24.752880622331404, "grad_norm": 0.28243616223335266, "learning_rate": 0.001, "loss": 1.9002, "step": 423200 }, { "epoch": 24.758729601684507, "grad_norm": 0.2986186742782593, "learning_rate": 0.001, "loss": 1.9003, "step": 423300 }, { "epoch": 24.764578581037608, "grad_norm": 0.20141303539276123, "learning_rate": 0.001, "loss": 1.8963, "step": 423400 }, { "epoch": 24.77042756039071, "grad_norm": 0.24270717799663544, "learning_rate": 0.001, "loss": 1.9025, "step": 423500 }, { "epoch": 24.776276539743815, "grad_norm": 0.2254769206047058, "learning_rate": 0.001, "loss": 1.8941, "step": 423600 }, { "epoch": 24.78212551909692, "grad_norm": 0.2601582705974579, "learning_rate": 0.001, "loss": 1.8962, "step": 423700 }, { "epoch": 24.78797449845002, "grad_norm": 0.2198876291513443, "learning_rate": 0.001, "loss": 1.8944, "step": 423800 }, { "epoch": 24.793823477803123, "grad_norm": 0.20391786098480225, "learning_rate": 0.001, "loss": 1.9025, "step": 423900 }, { "epoch": 24.799672457156227, "grad_norm": 0.20778092741966248, "learning_rate": 0.001, "loss": 1.9045, "step": 424000 }, { "epoch": 24.80552143650933, "grad_norm": 0.15558494627475739, "learning_rate": 0.001, "loss": 1.8945, "step": 424100 }, { "epoch": 24.81137041586243, "grad_norm": 0.21044482290744781, "learning_rate": 0.001, "loss": 1.9012, "step": 424200 }, { "epoch": 24.817219395215535, "grad_norm": 0.21249163150787354, "learning_rate": 0.001, "loss": 1.9002, "step": 424300 }, { "epoch": 24.82306837456864, "grad_norm": 0.27505314350128174, "learning_rate": 0.001, "loss": 1.8936, "step": 424400 }, { "epoch": 24.828917353921742, "grad_norm": 0.22120551764965057, "learning_rate": 0.001, "loss": 1.8969, "step": 424500 }, { "epoch": 24.834766333274843, "grad_norm": 0.2744581699371338, "learning_rate": 0.001, "loss": 1.8937, "step": 424600 }, { "epoch": 24.840615312627946, "grad_norm": 0.22536687552928925, "learning_rate": 0.001, "loss": 1.9029, "step": 424700 }, { "epoch": 24.84646429198105, "grad_norm": 0.22701521217823029, "learning_rate": 0.001, "loss": 1.9039, "step": 424800 }, { "epoch": 24.852313271334154, "grad_norm": 0.20300044119358063, "learning_rate": 0.001, "loss": 1.8956, "step": 424900 }, { "epoch": 24.858162250687254, "grad_norm": 0.20936231315135956, "learning_rate": 0.001, "loss": 1.8952, "step": 425000 }, { "epoch": 24.858162250687254, "eval_ag_news_accuracy": 0.238609375, "eval_ag_news_bleu_score": 6.672299924062402, "eval_ag_news_bleu_score_sem": 0.6261300893597308, "eval_ag_news_emb_cos_sim": 0.7145480513572693, "eval_ag_news_emb_cos_sim_sem": 0.015047293156385422, "eval_ag_news_emb_top1_equal": 0.9140625, "eval_ag_news_emb_top1_equal_sem": 0.024870097637176514, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7275314331054688, "eval_ag_news_n_ngrams_match_1": 13.109375, "eval_ag_news_n_ngrams_match_2": 3.4921875, "eval_ag_news_n_ngrams_match_3": 1.234375, "eval_ag_news_num_pred_words": 43.5859375, "eval_ag_news_num_true_words": 43.6484375, "eval_ag_news_perplexity": 15.295083442139907, "eval_ag_news_pred_num_tokens": 66.1796875, "eval_ag_news_rouge_score": 0.2911458473412264, "eval_ag_news_runtime": 38.0445, "eval_ag_news_samples_per_second": 13.142, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.32468066555306696, "eval_ag_news_token_set_f1_sem": 0.010427173984608372, "eval_ag_news_token_set_precision": 0.30559059663864363, "eval_ag_news_token_set_recall": 0.35420359319654604, "eval_ag_news_true_num_tokens": 59.9375, "step": 425000 }, { "epoch": 24.858162250687254, "eval_anthropic_toxic_prompts_accuracy": 0.10121875, "eval_anthropic_toxic_prompts_bleu_score": 43.83555840094491, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.751482031383623, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9062340259552002, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.00796433910727501, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1640625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.032861675748836264, "eval_anthropic_toxic_prompts_loss": 1.2220417261123657, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.2265625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.71875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.75, "eval_anthropic_toxic_prompts_num_pred_words": 14.46875, "eval_anthropic_toxic_prompts_num_true_words": 15.0546875, "eval_anthropic_toxic_prompts_perplexity": 3.394110508064347, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.6640625, "eval_anthropic_toxic_prompts_rouge_score": 0.6901559204555638, "eval_anthropic_toxic_prompts_runtime": 28.2274, "eval_anthropic_toxic_prompts_samples_per_second": 17.713, "eval_anthropic_toxic_prompts_steps_per_second": 0.035, "eval_anthropic_toxic_prompts_token_set_f1": 0.7089385387695123, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018504385412190558, "eval_anthropic_toxic_prompts_token_set_precision": 0.7031141714937422, "eval_anthropic_toxic_prompts_token_set_recall": 0.7200826258439481, "eval_anthropic_toxic_prompts_true_num_tokens": 18.640625, "step": 425000 }, { "epoch": 24.858162250687254, "eval_arxiv_accuracy": 0.36721875, "eval_arxiv_bleu_score": 1.6159733610188036, "eval_arxiv_bleu_score_sem": 0.16117964210583757, "eval_arxiv_emb_cos_sim": 0.4686565697193146, "eval_arxiv_emb_cos_sim_sem": 0.01893405243754387, "eval_arxiv_emb_top1_equal": 0.8984375, "eval_arxiv_emb_top1_equal_sem": 0.026804566383361816, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.469193696975708, "eval_arxiv_n_ngrams_match_1": 11.8046875, "eval_arxiv_n_ngrams_match_2": 2.140625, "eval_arxiv_n_ngrams_match_3": 0.5390625, "eval_arxiv_num_pred_words": 49.765625, "eval_arxiv_num_true_words": 86.6171875, "eval_arxiv_perplexity": 32.11084093576484, "eval_arxiv_pred_num_tokens": 125.6640625, "eval_arxiv_rouge_score": 0.16881770155932507, "eval_arxiv_runtime": 29.7356, "eval_arxiv_samples_per_second": 16.815, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.16748944934148888, "eval_arxiv_token_set_f1_sem": 0.008352639251181508, "eval_arxiv_token_set_precision": 0.11095107264645622, "eval_arxiv_token_set_recall": 0.40108541848624346, "eval_arxiv_true_num_tokens": 126.515625, "step": 425000 }, { "epoch": 24.858162250687254, "eval_python_code_alpaca_accuracy": 0.127078125, "eval_python_code_alpaca_bleu_score": 28.608269112873614, "eval_python_code_alpaca_bleu_score_sem": 1.626771311442487, "eval_python_code_alpaca_emb_cos_sim": 0.868765115737915, "eval_python_code_alpaca_emb_cos_sim_sem": 0.007859161123633385, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5181264877319336, "eval_python_code_alpaca_n_ngrams_match_1": 10.1171875, "eval_python_code_alpaca_n_ngrams_match_2": 5.6171875, "eval_python_code_alpaca_n_ngrams_match_3": 3.359375, "eval_python_code_alpaca_num_pred_words": 16.1953125, "eval_python_code_alpaca_num_true_words": 17.6796875, "eval_python_code_alpaca_perplexity": 4.563667094503246, "eval_python_code_alpaca_pred_num_tokens": 21.390625, "eval_python_code_alpaca_rouge_score": 0.5995391629424225, "eval_python_code_alpaca_runtime": 28.343, "eval_python_code_alpaca_samples_per_second": 17.641, "eval_python_code_alpaca_steps_per_second": 0.035, "eval_python_code_alpaca_token_set_f1": 0.6175818650469951, "eval_python_code_alpaca_token_set_f1_sem": 0.012705112509708516, "eval_python_code_alpaca_token_set_precision": 0.5986006417209437, "eval_python_code_alpaca_token_set_recall": 0.6447224148003919, "eval_python_code_alpaca_true_num_tokens": 22.8359375, "step": 425000 }, { "epoch": 24.858162250687254, "eval_wikibio_accuracy": 0.3663125, "eval_wikibio_bleu_score": 7.0444571490914525, "eval_wikibio_bleu_score_sem": 0.7129298422911187, "eval_wikibio_emb_cos_sim": 0.5979433655738831, "eval_wikibio_emb_cos_sim_sem": 0.024211062118411064, "eval_wikibio_emb_top1_equal": 0.8828125, "eval_wikibio_emb_top1_equal_sem": 0.02854125387966633, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.708489418029785, "eval_wikibio_n_ngrams_match_1": 14.3984375, "eval_wikibio_n_ngrams_match_2": 5.046875, "eval_wikibio_n_ngrams_match_3": 2.125, "eval_wikibio_num_pred_words": 50.359375, "eval_wikibio_num_true_words": 52.5390625, "eval_wikibio_perplexity": 15.006589700961795, "eval_wikibio_pred_num_tokens": 106.984375, "eval_wikibio_rouge_score": 0.28451935612635104, "eval_wikibio_runtime": 30.2721, "eval_wikibio_samples_per_second": 16.517, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.30293420360618406, "eval_wikibio_token_set_f1_sem": 0.013411409806396152, "eval_wikibio_token_set_precision": 0.26355381793357313, "eval_wikibio_token_set_recall": 0.4048941055673038, "eval_wikibio_true_num_tokens": 99.7421875, "step": 425000 }, { "epoch": 24.858162250687254, "eval_msmarco_accuracy": 0.39196875, "eval_msmarco_bleu_score": 17.361089463477867, "eval_msmarco_bleu_score_sem": 1.4948272966596583, "eval_msmarco_emb_cos_sim": 0.7861961126327515, "eval_msmarco_emb_cos_sim_sem": 0.01639672927558422, "eval_msmarco_emb_top1_equal": 0.9375, "eval_msmarco_emb_top1_equal_sem": 0.02147948183119297, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7608530521392822, "eval_msmarco_n_ngrams_match_1": 27.9296875, "eval_msmarco_n_ngrams_match_2": 13.0234375, "eval_msmarco_n_ngrams_match_3": 7.8125, "eval_msmarco_num_pred_words": 61.609375, "eval_msmarco_num_true_words": 63.40625, "eval_msmarco_perplexity": 5.81739782200721, "eval_msmarco_pred_num_tokens": 85.8828125, "eval_msmarco_rouge_score": 0.4270124452573344, "eval_msmarco_runtime": 25.5871, "eval_msmarco_samples_per_second": 19.541, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4631603294332162, "eval_msmarco_token_set_f1_sem": 0.014451689906202977, "eval_msmarco_token_set_precision": 0.4188994934761432, "eval_msmarco_token_set_recall": 0.5412411237655215, "eval_msmarco_true_num_tokens": 81.5859375, "step": 425000 }, { "epoch": 24.864011230040358, "grad_norm": 0.16269804537296295, "learning_rate": 0.001, "loss": 1.9039, "step": 425100 }, { "epoch": 24.86986020939346, "grad_norm": 0.21430301666259766, "learning_rate": 0.001, "loss": 1.8951, "step": 425200 }, { "epoch": 24.875709188746562, "grad_norm": 0.18654169142246246, "learning_rate": 0.001, "loss": 1.895, "step": 425300 }, { "epoch": 24.881558168099666, "grad_norm": 0.18337072432041168, "learning_rate": 0.001, "loss": 1.8981, "step": 425400 }, { "epoch": 24.88740714745277, "grad_norm": 0.24241721630096436, "learning_rate": 0.001, "loss": 1.8918, "step": 425500 }, { "epoch": 24.893256126805873, "grad_norm": 0.28486278653144836, "learning_rate": 0.001, "loss": 1.8941, "step": 425600 }, { "epoch": 24.899105106158974, "grad_norm": 0.19116191565990448, "learning_rate": 0.001, "loss": 1.8997, "step": 425700 }, { "epoch": 24.904954085512077, "grad_norm": 0.2354341447353363, "learning_rate": 0.001, "loss": 1.9029, "step": 425800 }, { "epoch": 24.91080306486518, "grad_norm": 0.2659619450569153, "learning_rate": 0.001, "loss": 1.8961, "step": 425900 }, { "epoch": 24.916652044218285, "grad_norm": 0.2807767391204834, "learning_rate": 0.001, "loss": 1.8929, "step": 426000 }, { "epoch": 24.922501023571385, "grad_norm": 0.18059101700782776, "learning_rate": 0.001, "loss": 1.9004, "step": 426100 }, { "epoch": 24.92835000292449, "grad_norm": 0.19236044585704803, "learning_rate": 0.001, "loss": 1.9034, "step": 426200 }, { "epoch": 24.934198982277593, "grad_norm": 0.23319900035858154, "learning_rate": 0.001, "loss": 1.9002, "step": 426300 }, { "epoch": 24.940047961630697, "grad_norm": 0.2817191183567047, "learning_rate": 0.001, "loss": 1.9004, "step": 426400 }, { "epoch": 24.945896940983797, "grad_norm": 0.28214114904403687, "learning_rate": 0.001, "loss": 1.8964, "step": 426500 }, { "epoch": 24.9517459203369, "grad_norm": 0.25598591566085815, "learning_rate": 0.001, "loss": 1.8991, "step": 426600 }, { "epoch": 24.957594899690005, "grad_norm": 0.20587122440338135, "learning_rate": 0.001, "loss": 1.9017, "step": 426700 }, { "epoch": 24.96344387904311, "grad_norm": 0.2059243619441986, "learning_rate": 0.001, "loss": 1.8975, "step": 426800 }, { "epoch": 24.96929285839621, "grad_norm": 0.2175225019454956, "learning_rate": 0.001, "loss": 1.8934, "step": 426900 }, { "epoch": 24.975141837749312, "grad_norm": 0.1833517849445343, "learning_rate": 0.001, "loss": 1.8959, "step": 427000 }, { "epoch": 24.980990817102416, "grad_norm": 0.22721457481384277, "learning_rate": 0.001, "loss": 1.8994, "step": 427100 }, { "epoch": 24.98683979645552, "grad_norm": 0.203566312789917, "learning_rate": 0.001, "loss": 1.8946, "step": 427200 }, { "epoch": 24.99268877580862, "grad_norm": 0.2732349932193756, "learning_rate": 0.001, "loss": 1.8984, "step": 427300 }, { "epoch": 24.998537755161724, "grad_norm": 0.19365213811397552, "learning_rate": 0.001, "loss": 1.8943, "step": 427400 }, { "epoch": 25.004386734514828, "grad_norm": 0.2581903636455536, "learning_rate": 0.001, "loss": 1.878, "step": 427500 }, { "epoch": 25.01023571386793, "grad_norm": 0.266510546207428, "learning_rate": 0.001, "loss": 1.8827, "step": 427600 }, { "epoch": 25.016084693221032, "grad_norm": 0.2392500340938568, "learning_rate": 0.001, "loss": 1.8826, "step": 427700 }, { "epoch": 25.021933672574136, "grad_norm": 0.28728535771369934, "learning_rate": 0.001, "loss": 1.8852, "step": 427800 }, { "epoch": 25.02778265192724, "grad_norm": 0.293486088514328, "learning_rate": 0.001, "loss": 1.8856, "step": 427900 }, { "epoch": 25.033631631280343, "grad_norm": 0.25225865840911865, "learning_rate": 0.001, "loss": 1.883, "step": 428000 }, { "epoch": 25.039480610633444, "grad_norm": 0.2255643755197525, "learning_rate": 0.001, "loss": 1.8813, "step": 428100 }, { "epoch": 25.045329589986547, "grad_norm": 0.2570464313030243, "learning_rate": 0.001, "loss": 1.8837, "step": 428200 }, { "epoch": 25.05117856933965, "grad_norm": 0.25589948892593384, "learning_rate": 0.001, "loss": 1.8831, "step": 428300 }, { "epoch": 25.05702754869275, "grad_norm": 0.2452911138534546, "learning_rate": 0.001, "loss": 1.886, "step": 428400 }, { "epoch": 25.062876528045855, "grad_norm": 0.24717159569263458, "learning_rate": 0.001, "loss": 1.89, "step": 428500 }, { "epoch": 25.06872550739896, "grad_norm": 0.21171869337558746, "learning_rate": 0.001, "loss": 1.8873, "step": 428600 }, { "epoch": 25.074574486752063, "grad_norm": 0.33870700001716614, "learning_rate": 0.001, "loss": 1.8826, "step": 428700 }, { "epoch": 25.080423466105163, "grad_norm": 0.25627774000167847, "learning_rate": 0.001, "loss": 1.8899, "step": 428800 }, { "epoch": 25.086272445458267, "grad_norm": 0.21858227252960205, "learning_rate": 0.001, "loss": 1.8839, "step": 428900 }, { "epoch": 25.09212142481137, "grad_norm": 0.28993403911590576, "learning_rate": 0.001, "loss": 1.8811, "step": 429000 }, { "epoch": 25.097970404164474, "grad_norm": 0.23478464782238007, "learning_rate": 0.001, "loss": 1.8885, "step": 429100 }, { "epoch": 25.103819383517575, "grad_norm": 0.3748364746570587, "learning_rate": 0.001, "loss": 1.89, "step": 429200 }, { "epoch": 25.10966836287068, "grad_norm": 0.3558204770088196, "learning_rate": 0.001, "loss": 1.8865, "step": 429300 }, { "epoch": 25.115517342223782, "grad_norm": 0.3421887159347534, "learning_rate": 0.001, "loss": 1.8855, "step": 429400 }, { "epoch": 25.121366321576886, "grad_norm": 0.28292521834373474, "learning_rate": 0.001, "loss": 1.8784, "step": 429500 }, { "epoch": 25.127215300929986, "grad_norm": 0.26537182927131653, "learning_rate": 0.001, "loss": 1.8832, "step": 429600 }, { "epoch": 25.13306428028309, "grad_norm": 0.34227296710014343, "learning_rate": 0.001, "loss": 1.8913, "step": 429700 }, { "epoch": 25.138913259636194, "grad_norm": 0.2513040602207184, "learning_rate": 0.001, "loss": 1.8849, "step": 429800 }, { "epoch": 25.144762238989298, "grad_norm": 0.2800808250904083, "learning_rate": 0.001, "loss": 1.8835, "step": 429900 }, { "epoch": 25.150611218342398, "grad_norm": 0.23869654536247253, "learning_rate": 0.001, "loss": 1.8845, "step": 430000 }, { "epoch": 25.156460197695502, "grad_norm": 0.24574072659015656, "learning_rate": 0.001, "loss": 1.8933, "step": 430100 }, { "epoch": 25.162309177048606, "grad_norm": 0.2401445060968399, "learning_rate": 0.001, "loss": 1.8871, "step": 430200 }, { "epoch": 25.16815815640171, "grad_norm": 0.25949275493621826, "learning_rate": 0.001, "loss": 1.885, "step": 430300 }, { "epoch": 25.17400713575481, "grad_norm": 0.25003668665885925, "learning_rate": 0.001, "loss": 1.8871, "step": 430400 }, { "epoch": 25.179856115107913, "grad_norm": 0.24501961469650269, "learning_rate": 0.001, "loss": 1.8896, "step": 430500 }, { "epoch": 25.185705094461017, "grad_norm": 0.2752419412136078, "learning_rate": 0.001, "loss": 1.897, "step": 430600 }, { "epoch": 25.19155407381412, "grad_norm": 0.36242836713790894, "learning_rate": 0.001, "loss": 1.8918, "step": 430700 }, { "epoch": 25.19740305316722, "grad_norm": 0.29344016313552856, "learning_rate": 0.001, "loss": 1.8842, "step": 430800 }, { "epoch": 25.203252032520325, "grad_norm": 0.28484728932380676, "learning_rate": 0.001, "loss": 1.8866, "step": 430900 }, { "epoch": 25.20910101187343, "grad_norm": 0.3243735134601593, "learning_rate": 0.001, "loss": 1.89, "step": 431000 }, { "epoch": 25.214949991226533, "grad_norm": 0.2863173484802246, "learning_rate": 0.001, "loss": 1.8918, "step": 431100 }, { "epoch": 25.220798970579633, "grad_norm": 0.26828402280807495, "learning_rate": 0.001, "loss": 1.8907, "step": 431200 }, { "epoch": 25.226647949932737, "grad_norm": 0.2591820955276489, "learning_rate": 0.001, "loss": 1.8873, "step": 431300 }, { "epoch": 25.23249692928584, "grad_norm": 0.29060035943984985, "learning_rate": 0.001, "loss": 1.8876, "step": 431400 }, { "epoch": 25.23834590863894, "grad_norm": 0.32413962483406067, "learning_rate": 0.001, "loss": 1.8888, "step": 431500 }, { "epoch": 25.244194887992045, "grad_norm": 0.24581310153007507, "learning_rate": 0.001, "loss": 1.8928, "step": 431600 }, { "epoch": 25.25004386734515, "grad_norm": 0.2869788408279419, "learning_rate": 0.001, "loss": 1.8842, "step": 431700 }, { "epoch": 25.255892846698252, "grad_norm": 0.24145780503749847, "learning_rate": 0.001, "loss": 1.8885, "step": 431800 }, { "epoch": 25.261741826051352, "grad_norm": 0.35251525044441223, "learning_rate": 0.001, "loss": 1.8873, "step": 431900 }, { "epoch": 25.267590805404456, "grad_norm": 0.25084835290908813, "learning_rate": 0.001, "loss": 1.8855, "step": 432000 }, { "epoch": 25.27343978475756, "grad_norm": 0.2023535966873169, "learning_rate": 0.001, "loss": 1.8928, "step": 432100 }, { "epoch": 25.279288764110664, "grad_norm": 0.2473735511302948, "learning_rate": 0.001, "loss": 1.8911, "step": 432200 }, { "epoch": 25.285137743463764, "grad_norm": 0.26011696457862854, "learning_rate": 0.001, "loss": 1.8913, "step": 432300 }, { "epoch": 25.290986722816868, "grad_norm": 0.2674715518951416, "learning_rate": 0.001, "loss": 1.8947, "step": 432400 }, { "epoch": 25.29683570216997, "grad_norm": 0.29785293340682983, "learning_rate": 0.001, "loss": 1.8886, "step": 432500 }, { "epoch": 25.302684681523075, "grad_norm": 0.3033718466758728, "learning_rate": 0.001, "loss": 1.8852, "step": 432600 }, { "epoch": 25.308533660876176, "grad_norm": 0.26272034645080566, "learning_rate": 0.001, "loss": 1.8923, "step": 432700 }, { "epoch": 25.31438264022928, "grad_norm": 0.27607282996177673, "learning_rate": 0.001, "loss": 1.8879, "step": 432800 }, { "epoch": 25.320231619582383, "grad_norm": 0.2782912254333496, "learning_rate": 0.001, "loss": 1.8906, "step": 432900 }, { "epoch": 25.326080598935487, "grad_norm": 0.2298450917005539, "learning_rate": 0.001, "loss": 1.8846, "step": 433000 }, { "epoch": 25.331929578288587, "grad_norm": 0.2167215645313263, "learning_rate": 0.001, "loss": 1.8863, "step": 433100 }, { "epoch": 25.33777855764169, "grad_norm": 0.23596693575382233, "learning_rate": 0.001, "loss": 1.8961, "step": 433200 }, { "epoch": 25.343627536994795, "grad_norm": 0.27746888995170593, "learning_rate": 0.001, "loss": 1.8918, "step": 433300 }, { "epoch": 25.3494765163479, "grad_norm": 0.2777402102947235, "learning_rate": 0.001, "loss": 1.8808, "step": 433400 }, { "epoch": 25.355325495701, "grad_norm": 0.31973615288734436, "learning_rate": 0.001, "loss": 1.8897, "step": 433500 }, { "epoch": 25.361174475054103, "grad_norm": 0.33703023195266724, "learning_rate": 0.001, "loss": 1.8876, "step": 433600 }, { "epoch": 25.367023454407207, "grad_norm": 0.22590014338493347, "learning_rate": 0.001, "loss": 1.8852, "step": 433700 }, { "epoch": 25.37287243376031, "grad_norm": 0.28182706236839294, "learning_rate": 0.001, "loss": 1.8871, "step": 433800 }, { "epoch": 25.37872141311341, "grad_norm": 0.3343202471733093, "learning_rate": 0.001, "loss": 1.8855, "step": 433900 }, { "epoch": 25.384570392466514, "grad_norm": 0.2889949679374695, "learning_rate": 0.001, "loss": 1.888, "step": 434000 }, { "epoch": 25.39041937181962, "grad_norm": 0.24908791482448578, "learning_rate": 0.001, "loss": 1.8869, "step": 434100 }, { "epoch": 25.396268351172722, "grad_norm": 0.2449014037847519, "learning_rate": 0.001, "loss": 1.8913, "step": 434200 }, { "epoch": 25.402117330525822, "grad_norm": 0.2885642647743225, "learning_rate": 0.001, "loss": 1.8893, "step": 434300 }, { "epoch": 25.407966309878926, "grad_norm": 0.2222575843334198, "learning_rate": 0.001, "loss": 1.892, "step": 434400 }, { "epoch": 25.41381528923203, "grad_norm": 0.25061243772506714, "learning_rate": 0.001, "loss": 1.8867, "step": 434500 }, { "epoch": 25.41966426858513, "grad_norm": 0.27615511417388916, "learning_rate": 0.001, "loss": 1.8922, "step": 434600 }, { "epoch": 25.425513247938234, "grad_norm": 0.2759777009487152, "learning_rate": 0.001, "loss": 1.8905, "step": 434700 }, { "epoch": 25.431362227291338, "grad_norm": 0.25675448775291443, "learning_rate": 0.001, "loss": 1.8868, "step": 434800 }, { "epoch": 25.43721120664444, "grad_norm": 0.3139793574810028, "learning_rate": 0.001, "loss": 1.892, "step": 434900 }, { "epoch": 25.443060185997542, "grad_norm": 0.29568272829055786, "learning_rate": 0.001, "loss": 1.8981, "step": 435000 }, { "epoch": 25.448909165350646, "grad_norm": 0.25805774331092834, "learning_rate": 0.001, "loss": 1.8917, "step": 435100 }, { "epoch": 25.45475814470375, "grad_norm": 0.3304448425769806, "learning_rate": 0.001, "loss": 1.8936, "step": 435200 }, { "epoch": 25.460607124056853, "grad_norm": 0.2559499740600586, "learning_rate": 0.001, "loss": 1.8875, "step": 435300 }, { "epoch": 25.466456103409953, "grad_norm": 0.2646504342556, "learning_rate": 0.001, "loss": 1.8915, "step": 435400 }, { "epoch": 25.472305082763057, "grad_norm": 0.3689253032207489, "learning_rate": 0.001, "loss": 1.8924, "step": 435500 }, { "epoch": 25.47815406211616, "grad_norm": 0.23088812828063965, "learning_rate": 0.001, "loss": 1.8942, "step": 435600 }, { "epoch": 25.484003041469265, "grad_norm": 0.19559407234191895, "learning_rate": 0.001, "loss": 1.8913, "step": 435700 }, { "epoch": 25.489852020822365, "grad_norm": 0.23438569903373718, "learning_rate": 0.001, "loss": 1.8898, "step": 435800 }, { "epoch": 25.49570100017547, "grad_norm": 0.25461679697036743, "learning_rate": 0.001, "loss": 1.8949, "step": 435900 }, { "epoch": 25.501549979528573, "grad_norm": 0.2752688229084015, "learning_rate": 0.001, "loss": 1.8941, "step": 436000 }, { "epoch": 25.507398958881677, "grad_norm": 0.25308147072792053, "learning_rate": 0.001, "loss": 1.8941, "step": 436100 }, { "epoch": 25.513247938234777, "grad_norm": 0.23266230523586273, "learning_rate": 0.001, "loss": 1.8864, "step": 436200 }, { "epoch": 25.51909691758788, "grad_norm": 0.4143013060092926, "learning_rate": 0.001, "loss": 1.8928, "step": 436300 }, { "epoch": 25.524945896940984, "grad_norm": 0.26300886273384094, "learning_rate": 0.001, "loss": 1.8914, "step": 436400 }, { "epoch": 25.530794876294088, "grad_norm": 0.2694845199584961, "learning_rate": 0.001, "loss": 1.8927, "step": 436500 }, { "epoch": 25.53664385564719, "grad_norm": 0.22843174636363983, "learning_rate": 0.001, "loss": 1.8865, "step": 436600 }, { "epoch": 25.542492835000292, "grad_norm": 0.24967257678508759, "learning_rate": 0.001, "loss": 1.8857, "step": 436700 }, { "epoch": 25.548341814353396, "grad_norm": 0.24330197274684906, "learning_rate": 0.001, "loss": 1.8871, "step": 436800 }, { "epoch": 25.5541907937065, "grad_norm": 0.27913838624954224, "learning_rate": 0.001, "loss": 1.8916, "step": 436900 }, { "epoch": 25.5600397730596, "grad_norm": 0.27030426263809204, "learning_rate": 0.001, "loss": 1.8892, "step": 437000 }, { "epoch": 25.565888752412704, "grad_norm": 0.32775208353996277, "learning_rate": 0.001, "loss": 1.8903, "step": 437100 }, { "epoch": 25.571737731765808, "grad_norm": 0.2545686662197113, "learning_rate": 0.001, "loss": 1.8866, "step": 437200 }, { "epoch": 25.57758671111891, "grad_norm": 0.24363167583942413, "learning_rate": 0.001, "loss": 1.8944, "step": 437300 }, { "epoch": 25.58343569047201, "grad_norm": 0.19375360012054443, "learning_rate": 0.001, "loss": 1.8947, "step": 437400 }, { "epoch": 25.589284669825116, "grad_norm": 0.23747022449970245, "learning_rate": 0.001, "loss": 1.8909, "step": 437500 }, { "epoch": 25.59513364917822, "grad_norm": 0.2299172282218933, "learning_rate": 0.001, "loss": 1.8909, "step": 437600 }, { "epoch": 25.60098262853132, "grad_norm": 0.32293882966041565, "learning_rate": 0.001, "loss": 1.8914, "step": 437700 }, { "epoch": 25.606831607884423, "grad_norm": 0.31616339087486267, "learning_rate": 0.001, "loss": 1.887, "step": 437800 }, { "epoch": 25.612680587237527, "grad_norm": 0.3117380142211914, "learning_rate": 0.001, "loss": 1.8921, "step": 437900 }, { "epoch": 25.61852956659063, "grad_norm": 0.22447460889816284, "learning_rate": 0.001, "loss": 1.8852, "step": 438000 }, { "epoch": 25.62437854594373, "grad_norm": 0.2733204960823059, "learning_rate": 0.001, "loss": 1.8956, "step": 438100 }, { "epoch": 25.630227525296835, "grad_norm": 0.3116430640220642, "learning_rate": 0.001, "loss": 1.8869, "step": 438200 }, { "epoch": 25.63607650464994, "grad_norm": 0.24857600033283234, "learning_rate": 0.001, "loss": 1.8881, "step": 438300 }, { "epoch": 25.641925484003043, "grad_norm": 0.2188650518655777, "learning_rate": 0.001, "loss": 1.8897, "step": 438400 }, { "epoch": 25.647774463356143, "grad_norm": 0.23192693293094635, "learning_rate": 0.001, "loss": 1.8921, "step": 438500 }, { "epoch": 25.653623442709247, "grad_norm": 0.2838289141654968, "learning_rate": 0.001, "loss": 1.893, "step": 438600 }, { "epoch": 25.65947242206235, "grad_norm": 0.2847062647342682, "learning_rate": 0.001, "loss": 1.8884, "step": 438700 }, { "epoch": 25.665321401415454, "grad_norm": 0.3384210169315338, "learning_rate": 0.001, "loss": 1.8908, "step": 438800 }, { "epoch": 25.671170380768555, "grad_norm": 0.24670441448688507, "learning_rate": 0.001, "loss": 1.8896, "step": 438900 }, { "epoch": 25.67701936012166, "grad_norm": 0.24692963063716888, "learning_rate": 0.001, "loss": 1.8893, "step": 439000 }, { "epoch": 25.682868339474762, "grad_norm": 0.23879338800907135, "learning_rate": 0.001, "loss": 1.8948, "step": 439100 }, { "epoch": 25.688717318827866, "grad_norm": 0.2791806161403656, "learning_rate": 0.001, "loss": 1.8943, "step": 439200 }, { "epoch": 25.694566298180966, "grad_norm": 0.3167051374912262, "learning_rate": 0.001, "loss": 1.8928, "step": 439300 }, { "epoch": 25.70041527753407, "grad_norm": 0.2414441853761673, "learning_rate": 0.001, "loss": 1.8909, "step": 439400 }, { "epoch": 25.706264256887174, "grad_norm": 0.2721966803073883, "learning_rate": 0.001, "loss": 1.894, "step": 439500 }, { "epoch": 25.712113236240278, "grad_norm": 0.2724594473838806, "learning_rate": 0.001, "loss": 1.8907, "step": 439600 }, { "epoch": 25.717962215593378, "grad_norm": 0.29518380761146545, "learning_rate": 0.001, "loss": 1.8978, "step": 439700 }, { "epoch": 25.72381119494648, "grad_norm": 0.2218954712152481, "learning_rate": 0.001, "loss": 1.9011, "step": 439800 }, { "epoch": 25.729660174299585, "grad_norm": 0.4588788151741028, "learning_rate": 0.001, "loss": 1.8897, "step": 439900 }, { "epoch": 25.73550915365269, "grad_norm": 0.2870415449142456, "learning_rate": 0.001, "loss": 1.8945, "step": 440000 }, { "epoch": 25.74135813300579, "grad_norm": 0.2902059257030487, "learning_rate": 0.001, "loss": 1.8982, "step": 440100 }, { "epoch": 25.747207112358893, "grad_norm": 0.2230147421360016, "learning_rate": 0.001, "loss": 1.8923, "step": 440200 }, { "epoch": 25.753056091711997, "grad_norm": 0.39702939987182617, "learning_rate": 0.001, "loss": 1.8958, "step": 440300 }, { "epoch": 25.7589050710651, "grad_norm": 0.29575419425964355, "learning_rate": 0.001, "loss": 1.8973, "step": 440400 }, { "epoch": 25.7647540504182, "grad_norm": 0.19493676722049713, "learning_rate": 0.001, "loss": 1.8907, "step": 440500 }, { "epoch": 25.770603029771305, "grad_norm": 0.22705990076065063, "learning_rate": 0.001, "loss": 1.892, "step": 440600 }, { "epoch": 25.77645200912441, "grad_norm": 0.23560461401939392, "learning_rate": 0.001, "loss": 1.8899, "step": 440700 }, { "epoch": 25.78230098847751, "grad_norm": 0.2833937704563141, "learning_rate": 0.001, "loss": 1.8992, "step": 440800 }, { "epoch": 25.788149967830613, "grad_norm": 0.24372470378875732, "learning_rate": 0.001, "loss": 1.8893, "step": 440900 }, { "epoch": 25.793998947183717, "grad_norm": 0.29759183526039124, "learning_rate": 0.001, "loss": 1.8936, "step": 441000 }, { "epoch": 25.79984792653682, "grad_norm": 0.268837034702301, "learning_rate": 0.001, "loss": 1.8941, "step": 441100 }, { "epoch": 25.80569690588992, "grad_norm": 0.29745855927467346, "learning_rate": 0.001, "loss": 1.8897, "step": 441200 }, { "epoch": 25.811545885243024, "grad_norm": 0.31730860471725464, "learning_rate": 0.001, "loss": 1.8964, "step": 441300 }, { "epoch": 25.817394864596128, "grad_norm": 0.2430170476436615, "learning_rate": 0.001, "loss": 1.8939, "step": 441400 }, { "epoch": 25.823243843949232, "grad_norm": 0.24171964824199677, "learning_rate": 0.001, "loss": 1.898, "step": 441500 }, { "epoch": 25.829092823302332, "grad_norm": 0.30373936891555786, "learning_rate": 0.001, "loss": 1.8941, "step": 441600 }, { "epoch": 25.834941802655436, "grad_norm": 0.2712891399860382, "learning_rate": 0.001, "loss": 1.9, "step": 441700 }, { "epoch": 25.84079078200854, "grad_norm": 0.25104838609695435, "learning_rate": 0.001, "loss": 1.8894, "step": 441800 }, { "epoch": 25.846639761361644, "grad_norm": 0.2831248342990875, "learning_rate": 0.001, "loss": 1.8924, "step": 441900 }, { "epoch": 25.852488740714744, "grad_norm": 0.27787044644355774, "learning_rate": 0.001, "loss": 1.892, "step": 442000 }, { "epoch": 25.858337720067848, "grad_norm": 0.28419992327690125, "learning_rate": 0.001, "loss": 1.8981, "step": 442100 }, { "epoch": 25.86418669942095, "grad_norm": 0.32282570004463196, "learning_rate": 0.001, "loss": 1.8943, "step": 442200 }, { "epoch": 25.870035678774055, "grad_norm": 0.2882575988769531, "learning_rate": 0.001, "loss": 1.8921, "step": 442300 }, { "epoch": 25.875884658127156, "grad_norm": 0.2761583924293518, "learning_rate": 0.001, "loss": 1.8863, "step": 442400 }, { "epoch": 25.88173363748026, "grad_norm": 0.2508326768875122, "learning_rate": 0.001, "loss": 1.892, "step": 442500 }, { "epoch": 25.887582616833363, "grad_norm": 0.27565521001815796, "learning_rate": 0.001, "loss": 1.8993, "step": 442600 }, { "epoch": 25.893431596186467, "grad_norm": 0.303070068359375, "learning_rate": 0.001, "loss": 1.893, "step": 442700 }, { "epoch": 25.899280575539567, "grad_norm": 0.30720117688179016, "learning_rate": 0.001, "loss": 1.8885, "step": 442800 }, { "epoch": 25.90512955489267, "grad_norm": 0.27738335728645325, "learning_rate": 0.001, "loss": 1.8912, "step": 442900 }, { "epoch": 25.910978534245775, "grad_norm": 0.3214915692806244, "learning_rate": 0.001, "loss": 1.8925, "step": 443000 }, { "epoch": 25.91682751359888, "grad_norm": 0.31758469343185425, "learning_rate": 0.001, "loss": 1.8935, "step": 443100 }, { "epoch": 25.92267649295198, "grad_norm": 0.3799380660057068, "learning_rate": 0.001, "loss": 1.8927, "step": 443200 }, { "epoch": 25.928525472305083, "grad_norm": 0.21105718612670898, "learning_rate": 0.001, "loss": 1.896, "step": 443300 }, { "epoch": 25.934374451658186, "grad_norm": 0.2618502676486969, "learning_rate": 0.001, "loss": 1.8909, "step": 443400 }, { "epoch": 25.94022343101129, "grad_norm": 0.28842031955718994, "learning_rate": 0.001, "loss": 1.8965, "step": 443500 }, { "epoch": 25.94607241036439, "grad_norm": 0.28630325198173523, "learning_rate": 0.001, "loss": 1.8927, "step": 443600 }, { "epoch": 25.951921389717494, "grad_norm": 0.2571265697479248, "learning_rate": 0.001, "loss": 1.8921, "step": 443700 }, { "epoch": 25.957770369070598, "grad_norm": 0.2188812792301178, "learning_rate": 0.001, "loss": 1.893, "step": 443800 }, { "epoch": 25.9636193484237, "grad_norm": 0.3098853826522827, "learning_rate": 0.001, "loss": 1.8886, "step": 443900 }, { "epoch": 25.969468327776802, "grad_norm": 0.301552414894104, "learning_rate": 0.001, "loss": 1.8927, "step": 444000 }, { "epoch": 25.975317307129906, "grad_norm": 0.30158162117004395, "learning_rate": 0.001, "loss": 1.8922, "step": 444100 }, { "epoch": 25.98116628648301, "grad_norm": 0.2711678445339203, "learning_rate": 0.001, "loss": 1.8933, "step": 444200 }, { "epoch": 25.98701526583611, "grad_norm": 0.2562733590602875, "learning_rate": 0.001, "loss": 1.8967, "step": 444300 }, { "epoch": 25.992864245189214, "grad_norm": 0.23725634813308716, "learning_rate": 0.001, "loss": 1.8904, "step": 444400 }, { "epoch": 25.998713224542318, "grad_norm": 0.2148614376783371, "learning_rate": 0.001, "loss": 1.8869, "step": 444500 }, { "epoch": 26.00456220389542, "grad_norm": 0.26463326811790466, "learning_rate": 0.001, "loss": 1.8827, "step": 444600 }, { "epoch": 26.01041118324852, "grad_norm": 0.21846884489059448, "learning_rate": 0.001, "loss": 1.8767, "step": 444700 }, { "epoch": 26.016260162601625, "grad_norm": 0.1843477189540863, "learning_rate": 0.001, "loss": 1.8796, "step": 444800 }, { "epoch": 26.02210914195473, "grad_norm": 0.17262870073318481, "learning_rate": 0.001, "loss": 1.8876, "step": 444900 }, { "epoch": 26.027958121307833, "grad_norm": 0.18644694983959198, "learning_rate": 0.001, "loss": 1.8778, "step": 445000 }, { "epoch": 26.033807100660933, "grad_norm": 0.18457579612731934, "learning_rate": 0.001, "loss": 1.8797, "step": 445100 }, { "epoch": 26.039656080014037, "grad_norm": 0.22818782925605774, "learning_rate": 0.001, "loss": 1.8861, "step": 445200 }, { "epoch": 26.04550505936714, "grad_norm": 0.19028021395206451, "learning_rate": 0.001, "loss": 1.8817, "step": 445300 }, { "epoch": 26.051354038720245, "grad_norm": 0.21882107853889465, "learning_rate": 0.001, "loss": 1.8793, "step": 445400 }, { "epoch": 26.057203018073345, "grad_norm": 0.20684464275836945, "learning_rate": 0.001, "loss": 1.8758, "step": 445500 }, { "epoch": 26.06305199742645, "grad_norm": 0.16401006281375885, "learning_rate": 0.001, "loss": 1.8851, "step": 445600 }, { "epoch": 26.068900976779553, "grad_norm": 0.23424673080444336, "learning_rate": 0.001, "loss": 1.8854, "step": 445700 }, { "epoch": 26.074749956132656, "grad_norm": 0.16517002880573273, "learning_rate": 0.001, "loss": 1.8818, "step": 445800 }, { "epoch": 26.080598935485757, "grad_norm": 0.2127421349287033, "learning_rate": 0.001, "loss": 1.8774, "step": 445900 }, { "epoch": 26.08644791483886, "grad_norm": 0.1768820881843567, "learning_rate": 0.001, "loss": 1.8784, "step": 446000 }, { "epoch": 26.092296894191964, "grad_norm": 0.19866521656513214, "learning_rate": 0.001, "loss": 1.879, "step": 446100 }, { "epoch": 26.098145873545068, "grad_norm": 0.22767195105552673, "learning_rate": 0.001, "loss": 1.8812, "step": 446200 }, { "epoch": 26.10399485289817, "grad_norm": 0.17252139747142792, "learning_rate": 0.001, "loss": 1.8811, "step": 446300 }, { "epoch": 26.109843832251272, "grad_norm": 0.1918560266494751, "learning_rate": 0.001, "loss": 1.8828, "step": 446400 }, { "epoch": 26.115692811604376, "grad_norm": 0.212545707821846, "learning_rate": 0.001, "loss": 1.876, "step": 446500 }, { "epoch": 26.12154179095748, "grad_norm": 0.18814383447170258, "learning_rate": 0.001, "loss": 1.8809, "step": 446600 }, { "epoch": 26.12739077031058, "grad_norm": 0.2513444125652313, "learning_rate": 0.001, "loss": 1.886, "step": 446700 }, { "epoch": 26.133239749663684, "grad_norm": 0.1647179126739502, "learning_rate": 0.001, "loss": 1.8837, "step": 446800 }, { "epoch": 26.139088729016787, "grad_norm": 0.21671006083488464, "learning_rate": 0.001, "loss": 1.8832, "step": 446900 }, { "epoch": 26.144937708369888, "grad_norm": 0.13474872708320618, "learning_rate": 0.001, "loss": 1.8795, "step": 447000 }, { "epoch": 26.15078668772299, "grad_norm": 0.20258422195911407, "learning_rate": 0.001, "loss": 1.8762, "step": 447100 }, { "epoch": 26.156635667076095, "grad_norm": 0.18043512105941772, "learning_rate": 0.001, "loss": 1.8837, "step": 447200 }, { "epoch": 26.1624846464292, "grad_norm": 0.18467997014522552, "learning_rate": 0.001, "loss": 1.881, "step": 447300 }, { "epoch": 26.1683336257823, "grad_norm": 0.309187114238739, "learning_rate": 0.001, "loss": 1.8817, "step": 447400 }, { "epoch": 26.174182605135403, "grad_norm": 0.22647197544574738, "learning_rate": 0.001, "loss": 1.884, "step": 447500 }, { "epoch": 26.180031584488507, "grad_norm": 0.19629113376140594, "learning_rate": 0.001, "loss": 1.8819, "step": 447600 }, { "epoch": 26.18588056384161, "grad_norm": 0.18572165071964264, "learning_rate": 0.001, "loss": 1.8795, "step": 447700 }, { "epoch": 26.19172954319471, "grad_norm": 0.16844718158245087, "learning_rate": 0.001, "loss": 1.8807, "step": 447800 }, { "epoch": 26.197578522547815, "grad_norm": 0.18692925572395325, "learning_rate": 0.001, "loss": 1.8805, "step": 447900 }, { "epoch": 26.20342750190092, "grad_norm": 0.2638774812221527, "learning_rate": 0.001, "loss": 1.8803, "step": 448000 }, { "epoch": 26.209276481254022, "grad_norm": 0.23028087615966797, "learning_rate": 0.001, "loss": 1.8771, "step": 448100 }, { "epoch": 26.215125460607123, "grad_norm": 0.16010615229606628, "learning_rate": 0.001, "loss": 1.8835, "step": 448200 }, { "epoch": 26.220974439960226, "grad_norm": 0.24764874577522278, "learning_rate": 0.001, "loss": 1.8807, "step": 448300 }, { "epoch": 26.22682341931333, "grad_norm": 0.20322860777378082, "learning_rate": 0.001, "loss": 1.8815, "step": 448400 }, { "epoch": 26.232672398666434, "grad_norm": 0.19962003827095032, "learning_rate": 0.001, "loss": 1.8799, "step": 448500 }, { "epoch": 26.238521378019534, "grad_norm": 0.18318989872932434, "learning_rate": 0.001, "loss": 1.882, "step": 448600 }, { "epoch": 26.244370357372638, "grad_norm": 0.15638509392738342, "learning_rate": 0.001, "loss": 1.8847, "step": 448700 }, { "epoch": 26.250219336725742, "grad_norm": 0.14533057808876038, "learning_rate": 0.001, "loss": 1.877, "step": 448800 }, { "epoch": 26.256068316078846, "grad_norm": 0.24838170409202576, "learning_rate": 0.001, "loss": 1.8858, "step": 448900 }, { "epoch": 26.261917295431946, "grad_norm": 0.23880021274089813, "learning_rate": 0.001, "loss": 1.8902, "step": 449000 }, { "epoch": 26.26776627478505, "grad_norm": 0.18541228771209717, "learning_rate": 0.001, "loss": 1.8818, "step": 449100 }, { "epoch": 26.273615254138154, "grad_norm": 0.24619653820991516, "learning_rate": 0.001, "loss": 1.8784, "step": 449200 }, { "epoch": 26.279464233491257, "grad_norm": 0.2561825215816498, "learning_rate": 0.001, "loss": 1.8793, "step": 449300 }, { "epoch": 26.285313212844358, "grad_norm": 0.23919256031513214, "learning_rate": 0.001, "loss": 1.8829, "step": 449400 }, { "epoch": 26.29116219219746, "grad_norm": 0.21281640231609344, "learning_rate": 0.001, "loss": 1.8861, "step": 449500 }, { "epoch": 26.297011171550565, "grad_norm": 0.29706257581710815, "learning_rate": 0.001, "loss": 1.8819, "step": 449600 }, { "epoch": 26.30286015090367, "grad_norm": 0.23857882618904114, "learning_rate": 0.001, "loss": 1.8862, "step": 449700 }, { "epoch": 26.30870913025677, "grad_norm": 0.1974320262670517, "learning_rate": 0.001, "loss": 1.894, "step": 449800 }, { "epoch": 26.314558109609873, "grad_norm": 0.20791174471378326, "learning_rate": 0.001, "loss": 1.8851, "step": 449900 }, { "epoch": 26.320407088962977, "grad_norm": 0.23163002729415894, "learning_rate": 0.001, "loss": 1.8761, "step": 450000 }, { "epoch": 26.320407088962977, "eval_ag_news_accuracy": 0.23978125, "eval_ag_news_bleu_score": 7.653890315089161, "eval_ag_news_bleu_score_sem": 0.5982593233610125, "eval_ag_news_emb_cos_sim": 0.721878170967102, "eval_ag_news_emb_cos_sim_sem": 0.013287859968841076, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.764805555343628, "eval_ag_news_n_ngrams_match_1": 14.3828125, "eval_ag_news_n_ngrams_match_2": 4.15625, "eval_ag_news_n_ngrams_match_3": 1.6171875, "eval_ag_news_num_pred_words": 45.1328125, "eval_ag_news_num_true_words": 45.4296875, "eval_ag_news_perplexity": 15.875952696302988, "eval_ag_news_pred_num_tokens": 67.8671875, "eval_ag_news_rouge_score": 0.30176888850472494, "eval_ag_news_runtime": 56.9701, "eval_ag_news_samples_per_second": 8.777, "eval_ag_news_steps_per_second": 0.018, "eval_ag_news_token_set_f1": 0.3403211138584824, "eval_ag_news_token_set_f1_sem": 0.009443589897536681, "eval_ag_news_token_set_precision": 0.31663905001255727, "eval_ag_news_token_set_recall": 0.3798645092409226, "eval_ag_news_true_num_tokens": 62.2265625, "step": 450000 }, { "epoch": 26.320407088962977, "eval_anthropic_toxic_prompts_accuracy": 0.100125, "eval_anthropic_toxic_prompts_bleu_score": 38.17486575845498, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.416142638226347, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8742340803146362, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.00896987970918417, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1015625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02680456515850638, "eval_anthropic_toxic_prompts_loss": 1.2635115385055542, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.5, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.796875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.6640625, "eval_anthropic_toxic_prompts_num_pred_words": 15.7890625, "eval_anthropic_toxic_prompts_num_true_words": 15.78125, "eval_anthropic_toxic_prompts_perplexity": 3.537822901945601, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.953125, "eval_anthropic_toxic_prompts_rouge_score": 0.6586082114469457, "eval_anthropic_toxic_prompts_runtime": 29.0884, "eval_anthropic_toxic_prompts_samples_per_second": 17.189, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6825578093692091, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017687993533211672, "eval_anthropic_toxic_prompts_token_set_precision": 0.6797754467679156, "eval_anthropic_toxic_prompts_token_set_recall": 0.6921495447276909, "eval_anthropic_toxic_prompts_true_num_tokens": 19.1640625, "step": 450000 }, { "epoch": 26.320407088962977, "eval_arxiv_accuracy": 0.373390625, "eval_arxiv_bleu_score": 1.6614548509385847, "eval_arxiv_bleu_score_sem": 0.16249605064178776, "eval_arxiv_emb_cos_sim": 0.46742361783981323, "eval_arxiv_emb_cos_sim_sem": 0.018515074625611305, "eval_arxiv_emb_top1_equal": 0.9140625, "eval_arxiv_emb_top1_equal_sem": 0.024870097637176514, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.456601619720459, "eval_arxiv_n_ngrams_match_1": 12.0, "eval_arxiv_n_ngrams_match_2": 2.25, "eval_arxiv_n_ngrams_match_3": 0.5078125, "eval_arxiv_num_pred_words": 52.140625, "eval_arxiv_num_true_words": 84.4609375, "eval_arxiv_perplexity": 31.709033848125973, "eval_arxiv_pred_num_tokens": 124.703125, "eval_arxiv_rouge_score": 0.16308507859364235, "eval_arxiv_runtime": 29.9818, "eval_arxiv_samples_per_second": 16.677, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.17130134316790852, "eval_arxiv_token_set_f1_sem": 0.009105033254642764, "eval_arxiv_token_set_precision": 0.11671855608671093, "eval_arxiv_token_set_recall": 0.4305086008018433, "eval_arxiv_true_num_tokens": 124.859375, "step": 450000 }, { "epoch": 26.320407088962977, "eval_python_code_alpaca_accuracy": 0.132234375, "eval_python_code_alpaca_bleu_score": 29.153130059983333, "eval_python_code_alpaca_bleu_score_sem": 1.6719277127440173, "eval_python_code_alpaca_emb_cos_sim": 0.8682206273078918, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009094567969441414, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.4869736433029175, "eval_python_code_alpaca_n_ngrams_match_1": 10.9296875, "eval_python_code_alpaca_n_ngrams_match_2": 5.890625, "eval_python_code_alpaca_n_ngrams_match_3": 3.4765625, "eval_python_code_alpaca_num_pred_words": 18.78125, "eval_python_code_alpaca_num_true_words": 19.484375, "eval_python_code_alpaca_perplexity": 4.423687584094357, "eval_python_code_alpaca_pred_num_tokens": 24.984375, "eval_python_code_alpaca_rouge_score": 0.6037704955682428, "eval_python_code_alpaca_runtime": 29.2389, "eval_python_code_alpaca_samples_per_second": 17.101, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.621358234063153, "eval_python_code_alpaca_token_set_f1_sem": 0.013491428773877392, "eval_python_code_alpaca_token_set_precision": 0.608651749382268, "eval_python_code_alpaca_token_set_recall": 0.6403435030711766, "eval_python_code_alpaca_true_num_tokens": 24.5703125, "step": 450000 }, { "epoch": 26.320407088962977, "eval_wikibio_accuracy": 0.356109375, "eval_wikibio_bleu_score": 8.011223668377372, "eval_wikibio_bleu_score_sem": 0.7287459677816382, "eval_wikibio_emb_cos_sim": 0.5943806171417236, "eval_wikibio_emb_cos_sim_sem": 0.022995809093117714, "eval_wikibio_emb_top1_equal": 0.921875, "eval_wikibio_emb_top1_equal_sem": 0.023813825100660324, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.746593713760376, "eval_wikibio_n_ngrams_match_1": 15.34375, "eval_wikibio_n_ngrams_match_2": 5.453125, "eval_wikibio_n_ngrams_match_3": 2.2421875, "eval_wikibio_num_pred_words": 52.90625, "eval_wikibio_num_true_words": 51.8359375, "eval_wikibio_perplexity": 15.589439248552852, "eval_wikibio_pred_num_tokens": 100.7265625, "eval_wikibio_rouge_score": 0.30552025659513393, "eval_wikibio_runtime": 30.4126, "eval_wikibio_samples_per_second": 16.441, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.33416672371854267, "eval_wikibio_token_set_f1_sem": 0.012909135706487137, "eval_wikibio_token_set_precision": 0.29553118934211425, "eval_wikibio_token_set_recall": 0.4239360182995451, "eval_wikibio_true_num_tokens": 98.71875, "step": 450000 }, { "epoch": 26.320407088962977, "eval_msmarco_accuracy": 0.394125, "eval_msmarco_bleu_score": 14.789837187383167, "eval_msmarco_bleu_score_sem": 1.3001993430493184, "eval_msmarco_emb_cos_sim": 0.7731555700302124, "eval_msmarco_emb_cos_sim_sem": 0.015427948907017708, "eval_msmarco_emb_top1_equal": 0.9453125, "eval_msmarco_emb_top1_equal_sem": 0.020175758749246597, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.786341667175293, "eval_msmarco_n_ngrams_match_1": 26.5390625, "eval_msmarco_n_ngrams_match_2": 11.25, "eval_msmarco_n_ngrams_match_3": 5.8828125, "eval_msmarco_num_pred_words": 59.625, "eval_msmarco_num_true_words": 62.859375, "eval_msmarco_perplexity": 5.967581086609374, "eval_msmarco_pred_num_tokens": 80.9140625, "eval_msmarco_rouge_score": 0.41293340729446876, "eval_msmarco_runtime": 24.8673, "eval_msmarco_samples_per_second": 20.107, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.4405843502061742, "eval_msmarco_token_set_f1_sem": 0.012836207894701412, "eval_msmarco_token_set_precision": 0.3999446217783437, "eval_msmarco_token_set_recall": 0.5076570493745421, "eval_msmarco_true_num_tokens": 80.515625, "step": 450000 }, { "epoch": 26.326256068316077, "grad_norm": 0.24877096712589264, "learning_rate": 0.001, "loss": 1.8848, "step": 450100 }, { "epoch": 26.33210504766918, "grad_norm": 0.1822071522474289, "learning_rate": 0.001, "loss": 1.8844, "step": 450200 }, { "epoch": 26.337954027022285, "grad_norm": 0.20113569498062134, "learning_rate": 0.001, "loss": 1.8861, "step": 450300 }, { "epoch": 26.34380300637539, "grad_norm": 0.19211407005786896, "learning_rate": 0.001, "loss": 1.8822, "step": 450400 }, { "epoch": 26.34965198572849, "grad_norm": 0.17030006647109985, "learning_rate": 0.001, "loss": 1.8844, "step": 450500 }, { "epoch": 26.355500965081593, "grad_norm": 0.2167540341615677, "learning_rate": 0.001, "loss": 1.876, "step": 450600 }, { "epoch": 26.361349944434696, "grad_norm": 0.18691189587116241, "learning_rate": 0.001, "loss": 1.887, "step": 450700 }, { "epoch": 26.3671989237878, "grad_norm": 0.19897635281085968, "learning_rate": 0.001, "loss": 1.8787, "step": 450800 }, { "epoch": 26.3730479031409, "grad_norm": 0.2091594785451889, "learning_rate": 0.001, "loss": 1.8832, "step": 450900 }, { "epoch": 26.378896882494004, "grad_norm": 0.2018042653799057, "learning_rate": 0.001, "loss": 1.8881, "step": 451000 }, { "epoch": 26.384745861847108, "grad_norm": 0.15245816111564636, "learning_rate": 0.001, "loss": 1.8825, "step": 451100 }, { "epoch": 26.390594841200212, "grad_norm": 0.2233721762895584, "learning_rate": 0.001, "loss": 1.8849, "step": 451200 }, { "epoch": 26.396443820553312, "grad_norm": 0.18882963061332703, "learning_rate": 0.001, "loss": 1.8806, "step": 451300 }, { "epoch": 26.402292799906416, "grad_norm": 0.1806231290102005, "learning_rate": 0.001, "loss": 1.8853, "step": 451400 }, { "epoch": 26.40814177925952, "grad_norm": 0.20246383547782898, "learning_rate": 0.001, "loss": 1.8849, "step": 451500 }, { "epoch": 26.413990758612623, "grad_norm": 0.19186639785766602, "learning_rate": 0.001, "loss": 1.8843, "step": 451600 }, { "epoch": 26.419839737965724, "grad_norm": 0.20863957703113556, "learning_rate": 0.001, "loss": 1.8839, "step": 451700 }, { "epoch": 26.425688717318828, "grad_norm": 0.1772914081811905, "learning_rate": 0.001, "loss": 1.8807, "step": 451800 }, { "epoch": 26.43153769667193, "grad_norm": 0.2827383577823639, "learning_rate": 0.001, "loss": 1.8919, "step": 451900 }, { "epoch": 26.437386676025035, "grad_norm": 0.18491989374160767, "learning_rate": 0.001, "loss": 1.8883, "step": 452000 }, { "epoch": 26.443235655378135, "grad_norm": 0.2868645191192627, "learning_rate": 0.001, "loss": 1.8865, "step": 452100 }, { "epoch": 26.44908463473124, "grad_norm": 0.18818561732769012, "learning_rate": 0.001, "loss": 1.8818, "step": 452200 }, { "epoch": 26.454933614084343, "grad_norm": 0.19434134662151337, "learning_rate": 0.001, "loss": 1.8823, "step": 452300 }, { "epoch": 26.460782593437447, "grad_norm": 0.20550362765789032, "learning_rate": 0.001, "loss": 1.8868, "step": 452400 }, { "epoch": 26.466631572790547, "grad_norm": 0.17535483837127686, "learning_rate": 0.001, "loss": 1.8877, "step": 452500 }, { "epoch": 26.47248055214365, "grad_norm": 0.1968081146478653, "learning_rate": 0.001, "loss": 1.8905, "step": 452600 }, { "epoch": 26.478329531496755, "grad_norm": 0.20289219915866852, "learning_rate": 0.001, "loss": 1.8859, "step": 452700 }, { "epoch": 26.48417851084986, "grad_norm": 0.2952752411365509, "learning_rate": 0.001, "loss": 1.8791, "step": 452800 }, { "epoch": 26.49002749020296, "grad_norm": 0.19226495921611786, "learning_rate": 0.001, "loss": 1.8945, "step": 452900 }, { "epoch": 26.495876469556062, "grad_norm": 0.20724110305309296, "learning_rate": 0.001, "loss": 1.8905, "step": 453000 }, { "epoch": 26.501725448909166, "grad_norm": 0.2260402888059616, "learning_rate": 0.001, "loss": 1.8869, "step": 453100 }, { "epoch": 26.507574428262267, "grad_norm": 0.21287445724010468, "learning_rate": 0.001, "loss": 1.8852, "step": 453200 }, { "epoch": 26.51342340761537, "grad_norm": 0.20599861443042755, "learning_rate": 0.001, "loss": 1.8815, "step": 453300 }, { "epoch": 26.519272386968474, "grad_norm": 0.18358854949474335, "learning_rate": 0.001, "loss": 1.8886, "step": 453400 }, { "epoch": 26.525121366321578, "grad_norm": 0.17543694376945496, "learning_rate": 0.001, "loss": 1.8792, "step": 453500 }, { "epoch": 26.530970345674678, "grad_norm": 0.21258607506752014, "learning_rate": 0.001, "loss": 1.8916, "step": 453600 }, { "epoch": 26.536819325027782, "grad_norm": 0.19329121708869934, "learning_rate": 0.001, "loss": 1.8868, "step": 453700 }, { "epoch": 26.542668304380886, "grad_norm": 0.20388230681419373, "learning_rate": 0.001, "loss": 1.8869, "step": 453800 }, { "epoch": 26.54851728373399, "grad_norm": 0.22681181132793427, "learning_rate": 0.001, "loss": 1.8857, "step": 453900 }, { "epoch": 26.55436626308709, "grad_norm": 0.21864460408687592, "learning_rate": 0.001, "loss": 1.888, "step": 454000 }, { "epoch": 26.560215242440194, "grad_norm": 0.19637876749038696, "learning_rate": 0.001, "loss": 1.8849, "step": 454100 }, { "epoch": 26.566064221793297, "grad_norm": 0.18391844630241394, "learning_rate": 0.001, "loss": 1.89, "step": 454200 }, { "epoch": 26.5719132011464, "grad_norm": 0.17293788492679596, "learning_rate": 0.001, "loss": 1.8849, "step": 454300 }, { "epoch": 26.5777621804995, "grad_norm": 0.18335677683353424, "learning_rate": 0.001, "loss": 1.8817, "step": 454400 }, { "epoch": 26.583611159852605, "grad_norm": 0.20333661139011383, "learning_rate": 0.001, "loss": 1.8888, "step": 454500 }, { "epoch": 26.58946013920571, "grad_norm": 0.18950176239013672, "learning_rate": 0.001, "loss": 1.8835, "step": 454600 }, { "epoch": 26.595309118558813, "grad_norm": 0.24932265281677246, "learning_rate": 0.001, "loss": 1.8871, "step": 454700 }, { "epoch": 26.601158097911913, "grad_norm": 0.18323881924152374, "learning_rate": 0.001, "loss": 1.8889, "step": 454800 }, { "epoch": 26.607007077265017, "grad_norm": 0.18632592260837555, "learning_rate": 0.001, "loss": 1.8905, "step": 454900 }, { "epoch": 26.61285605661812, "grad_norm": 0.23919980227947235, "learning_rate": 0.001, "loss": 1.8878, "step": 455000 }, { "epoch": 26.618705035971225, "grad_norm": 0.170438751578331, "learning_rate": 0.001, "loss": 1.893, "step": 455100 }, { "epoch": 26.624554015324325, "grad_norm": 0.22594211995601654, "learning_rate": 0.001, "loss": 1.8956, "step": 455200 }, { "epoch": 26.63040299467743, "grad_norm": 0.18281662464141846, "learning_rate": 0.001, "loss": 1.8807, "step": 455300 }, { "epoch": 26.636251974030532, "grad_norm": 0.20672886073589325, "learning_rate": 0.001, "loss": 1.8885, "step": 455400 }, { "epoch": 26.642100953383636, "grad_norm": 0.17604708671569824, "learning_rate": 0.001, "loss": 1.885, "step": 455500 }, { "epoch": 26.647949932736736, "grad_norm": 0.217324897646904, "learning_rate": 0.001, "loss": 1.8885, "step": 455600 }, { "epoch": 26.65379891208984, "grad_norm": 0.2071678340435028, "learning_rate": 0.001, "loss": 1.8893, "step": 455700 }, { "epoch": 26.659647891442944, "grad_norm": 0.21418310701847076, "learning_rate": 0.001, "loss": 1.8902, "step": 455800 }, { "epoch": 26.665496870796048, "grad_norm": 0.1609523594379425, "learning_rate": 0.001, "loss": 1.892, "step": 455900 }, { "epoch": 26.671345850149148, "grad_norm": 0.2989571690559387, "learning_rate": 0.001, "loss": 1.8822, "step": 456000 }, { "epoch": 26.677194829502252, "grad_norm": 0.1925773024559021, "learning_rate": 0.001, "loss": 1.8914, "step": 456100 }, { "epoch": 26.683043808855356, "grad_norm": 0.22298765182495117, "learning_rate": 0.001, "loss": 1.8929, "step": 456200 }, { "epoch": 26.688892788208456, "grad_norm": 0.18484103679656982, "learning_rate": 0.001, "loss": 1.8829, "step": 456300 }, { "epoch": 26.69474176756156, "grad_norm": 0.1704961210489273, "learning_rate": 0.001, "loss": 1.8894, "step": 456400 }, { "epoch": 26.700590746914663, "grad_norm": 0.19344785809516907, "learning_rate": 0.001, "loss": 1.885, "step": 456500 }, { "epoch": 26.706439726267767, "grad_norm": 0.18429480493068695, "learning_rate": 0.001, "loss": 1.8931, "step": 456600 }, { "epoch": 26.712288705620868, "grad_norm": 0.18210247159004211, "learning_rate": 0.001, "loss": 1.8849, "step": 456700 }, { "epoch": 26.71813768497397, "grad_norm": 0.17975804209709167, "learning_rate": 0.001, "loss": 1.886, "step": 456800 }, { "epoch": 26.723986664327075, "grad_norm": 0.2021581083536148, "learning_rate": 0.001, "loss": 1.8871, "step": 456900 }, { "epoch": 26.72983564368018, "grad_norm": 0.20506350696086884, "learning_rate": 0.001, "loss": 1.887, "step": 457000 }, { "epoch": 26.73568462303328, "grad_norm": 0.19142012298107147, "learning_rate": 0.001, "loss": 1.8834, "step": 457100 }, { "epoch": 26.741533602386383, "grad_norm": 0.15183162689208984, "learning_rate": 0.001, "loss": 1.8896, "step": 457200 }, { "epoch": 26.747382581739487, "grad_norm": 0.20700760185718536, "learning_rate": 0.001, "loss": 1.892, "step": 457300 }, { "epoch": 26.75323156109259, "grad_norm": 0.21625301241874695, "learning_rate": 0.001, "loss": 1.885, "step": 457400 }, { "epoch": 26.75908054044569, "grad_norm": 0.16572432219982147, "learning_rate": 0.001, "loss": 1.8873, "step": 457500 }, { "epoch": 26.764929519798795, "grad_norm": 0.18387676775455475, "learning_rate": 0.001, "loss": 1.8856, "step": 457600 }, { "epoch": 26.7707784991519, "grad_norm": 0.2105959951877594, "learning_rate": 0.001, "loss": 1.8815, "step": 457700 }, { "epoch": 26.776627478505002, "grad_norm": 0.20326904952526093, "learning_rate": 0.001, "loss": 1.893, "step": 457800 }, { "epoch": 26.782476457858102, "grad_norm": 0.15537326037883759, "learning_rate": 0.001, "loss": 1.8799, "step": 457900 }, { "epoch": 26.788325437211206, "grad_norm": 0.17568984627723694, "learning_rate": 0.001, "loss": 1.8849, "step": 458000 }, { "epoch": 26.79417441656431, "grad_norm": 0.2031925767660141, "learning_rate": 0.001, "loss": 1.8933, "step": 458100 }, { "epoch": 26.800023395917414, "grad_norm": 0.22728949785232544, "learning_rate": 0.001, "loss": 1.8872, "step": 458200 }, { "epoch": 26.805872375270514, "grad_norm": 0.16958408057689667, "learning_rate": 0.001, "loss": 1.8846, "step": 458300 }, { "epoch": 26.811721354623618, "grad_norm": 0.21249039471149445, "learning_rate": 0.001, "loss": 1.8933, "step": 458400 }, { "epoch": 26.81757033397672, "grad_norm": 0.18001309037208557, "learning_rate": 0.001, "loss": 1.8888, "step": 458500 }, { "epoch": 26.823419313329826, "grad_norm": 0.22910384833812714, "learning_rate": 0.001, "loss": 1.8902, "step": 458600 }, { "epoch": 26.829268292682926, "grad_norm": 0.1928349882364273, "learning_rate": 0.001, "loss": 1.8893, "step": 458700 }, { "epoch": 26.83511727203603, "grad_norm": 0.21982762217521667, "learning_rate": 0.001, "loss": 1.8835, "step": 458800 }, { "epoch": 26.840966251389133, "grad_norm": 0.16378378868103027, "learning_rate": 0.001, "loss": 1.8889, "step": 458900 }, { "epoch": 26.846815230742237, "grad_norm": 0.18309125304222107, "learning_rate": 0.001, "loss": 1.8901, "step": 459000 }, { "epoch": 26.852664210095337, "grad_norm": 0.2286805808544159, "learning_rate": 0.001, "loss": 1.8894, "step": 459100 }, { "epoch": 26.85851318944844, "grad_norm": 0.2295795977115631, "learning_rate": 0.001, "loss": 1.8834, "step": 459200 }, { "epoch": 26.864362168801545, "grad_norm": 0.18922993540763855, "learning_rate": 0.001, "loss": 1.8884, "step": 459300 }, { "epoch": 26.870211148154645, "grad_norm": 0.218716561794281, "learning_rate": 0.001, "loss": 1.8866, "step": 459400 }, { "epoch": 26.87606012750775, "grad_norm": 0.18516267836093903, "learning_rate": 0.001, "loss": 1.8879, "step": 459500 }, { "epoch": 26.881909106860853, "grad_norm": 0.1950608491897583, "learning_rate": 0.001, "loss": 1.8902, "step": 459600 }, { "epoch": 26.887758086213957, "grad_norm": 0.17977631092071533, "learning_rate": 0.001, "loss": 1.8897, "step": 459700 }, { "epoch": 26.893607065567057, "grad_norm": 0.15867434442043304, "learning_rate": 0.001, "loss": 1.8871, "step": 459800 }, { "epoch": 26.89945604492016, "grad_norm": 0.15193867683410645, "learning_rate": 0.001, "loss": 1.8872, "step": 459900 }, { "epoch": 26.905305024273265, "grad_norm": 0.19411782920360565, "learning_rate": 0.001, "loss": 1.884, "step": 460000 }, { "epoch": 26.91115400362637, "grad_norm": 0.23771819472312927, "learning_rate": 0.001, "loss": 1.8904, "step": 460100 }, { "epoch": 26.91700298297947, "grad_norm": 0.18789266049861908, "learning_rate": 0.001, "loss": 1.8884, "step": 460200 }, { "epoch": 26.922851962332572, "grad_norm": 0.1850876361131668, "learning_rate": 0.001, "loss": 1.8892, "step": 460300 }, { "epoch": 26.928700941685676, "grad_norm": 0.18759313225746155, "learning_rate": 0.001, "loss": 1.8869, "step": 460400 }, { "epoch": 26.93454992103878, "grad_norm": 0.17634856700897217, "learning_rate": 0.001, "loss": 1.8905, "step": 460500 }, { "epoch": 26.94039890039188, "grad_norm": 0.22936806082725525, "learning_rate": 0.001, "loss": 1.8868, "step": 460600 }, { "epoch": 26.946247879744984, "grad_norm": 0.1804109662771225, "learning_rate": 0.001, "loss": 1.8835, "step": 460700 }, { "epoch": 26.952096859098088, "grad_norm": 0.2012353092432022, "learning_rate": 0.001, "loss": 1.8856, "step": 460800 }, { "epoch": 26.95794583845119, "grad_norm": 0.18253524601459503, "learning_rate": 0.001, "loss": 1.8847, "step": 460900 }, { "epoch": 26.963794817804292, "grad_norm": 0.21425698697566986, "learning_rate": 0.001, "loss": 1.8931, "step": 461000 }, { "epoch": 26.969643797157396, "grad_norm": 0.24230709671974182, "learning_rate": 0.001, "loss": 1.8887, "step": 461100 }, { "epoch": 26.9754927765105, "grad_norm": 0.2124946117401123, "learning_rate": 0.001, "loss": 1.8838, "step": 461200 }, { "epoch": 26.981341755863603, "grad_norm": 0.17656660079956055, "learning_rate": 0.001, "loss": 1.8786, "step": 461300 }, { "epoch": 26.987190735216704, "grad_norm": 0.23254182934761047, "learning_rate": 0.001, "loss": 1.886, "step": 461400 }, { "epoch": 26.993039714569807, "grad_norm": 0.16892674565315247, "learning_rate": 0.001, "loss": 1.8933, "step": 461500 }, { "epoch": 26.99888869392291, "grad_norm": 0.20199765264987946, "learning_rate": 0.001, "loss": 1.8815, "step": 461600 }, { "epoch": 27.004737673276015, "grad_norm": 0.2310163527727127, "learning_rate": 0.001, "loss": 1.8856, "step": 461700 }, { "epoch": 27.010586652629115, "grad_norm": 0.2799108326435089, "learning_rate": 0.001, "loss": 1.8766, "step": 461800 }, { "epoch": 27.01643563198222, "grad_norm": 0.19701004028320312, "learning_rate": 0.001, "loss": 1.8711, "step": 461900 }, { "epoch": 27.022284611335323, "grad_norm": 0.2157520353794098, "learning_rate": 0.001, "loss": 1.8768, "step": 462000 }, { "epoch": 27.028133590688427, "grad_norm": 0.20718543231487274, "learning_rate": 0.001, "loss": 1.8701, "step": 462100 }, { "epoch": 27.033982570041527, "grad_norm": 0.2232256978750229, "learning_rate": 0.001, "loss": 1.8726, "step": 462200 }, { "epoch": 27.03983154939463, "grad_norm": 0.2157769650220871, "learning_rate": 0.001, "loss": 1.8762, "step": 462300 }, { "epoch": 27.045680528747734, "grad_norm": 0.24513451755046844, "learning_rate": 0.001, "loss": 1.874, "step": 462400 }, { "epoch": 27.051529508100835, "grad_norm": 0.26174890995025635, "learning_rate": 0.001, "loss": 1.8766, "step": 462500 }, { "epoch": 27.05737848745394, "grad_norm": 0.2002512812614441, "learning_rate": 0.001, "loss": 1.8776, "step": 462600 }, { "epoch": 27.063227466807042, "grad_norm": 0.21077637374401093, "learning_rate": 0.001, "loss": 1.873, "step": 462700 }, { "epoch": 27.069076446160146, "grad_norm": 0.2828896939754486, "learning_rate": 0.001, "loss": 1.873, "step": 462800 }, { "epoch": 27.074925425513246, "grad_norm": 0.21767908334732056, "learning_rate": 0.001, "loss": 1.875, "step": 462900 }, { "epoch": 27.08077440486635, "grad_norm": 0.1986842006444931, "learning_rate": 0.001, "loss": 1.8735, "step": 463000 }, { "epoch": 27.086623384219454, "grad_norm": 0.19123350083827972, "learning_rate": 0.001, "loss": 1.8765, "step": 463100 }, { "epoch": 27.092472363572558, "grad_norm": 0.1855892390012741, "learning_rate": 0.001, "loss": 1.8745, "step": 463200 }, { "epoch": 27.098321342925658, "grad_norm": 0.23250025510787964, "learning_rate": 0.001, "loss": 1.8754, "step": 463300 }, { "epoch": 27.10417032227876, "grad_norm": 0.27218034863471985, "learning_rate": 0.001, "loss": 1.8751, "step": 463400 }, { "epoch": 27.110019301631866, "grad_norm": 0.20772784948349, "learning_rate": 0.001, "loss": 1.8797, "step": 463500 }, { "epoch": 27.11586828098497, "grad_norm": 0.22974234819412231, "learning_rate": 0.001, "loss": 1.8747, "step": 463600 }, { "epoch": 27.12171726033807, "grad_norm": 0.19219501316547394, "learning_rate": 0.001, "loss": 1.8766, "step": 463700 }, { "epoch": 27.127566239691173, "grad_norm": 0.20245321094989777, "learning_rate": 0.001, "loss": 1.8769, "step": 463800 }, { "epoch": 27.133415219044277, "grad_norm": 0.18863539397716522, "learning_rate": 0.001, "loss": 1.8727, "step": 463900 }, { "epoch": 27.13926419839738, "grad_norm": 0.28572651743888855, "learning_rate": 0.001, "loss": 1.8796, "step": 464000 }, { "epoch": 27.14511317775048, "grad_norm": 0.2487834244966507, "learning_rate": 0.001, "loss": 1.8768, "step": 464100 }, { "epoch": 27.150962157103585, "grad_norm": 0.2018076628446579, "learning_rate": 0.001, "loss": 1.8791, "step": 464200 }, { "epoch": 27.15681113645669, "grad_norm": 0.2027546912431717, "learning_rate": 0.001, "loss": 1.8747, "step": 464300 }, { "epoch": 27.162660115809793, "grad_norm": 0.18204060196876526, "learning_rate": 0.001, "loss": 1.8728, "step": 464400 }, { "epoch": 27.168509095162893, "grad_norm": 0.20569618046283722, "learning_rate": 0.001, "loss": 1.8741, "step": 464500 }, { "epoch": 27.174358074515997, "grad_norm": 0.24429886043071747, "learning_rate": 0.001, "loss": 1.876, "step": 464600 }, { "epoch": 27.1802070538691, "grad_norm": 0.1798180788755417, "learning_rate": 0.001, "loss": 1.8761, "step": 464700 }, { "epoch": 27.186056033222204, "grad_norm": 0.18970635533332825, "learning_rate": 0.001, "loss": 1.8805, "step": 464800 }, { "epoch": 27.191905012575305, "grad_norm": 0.2055460512638092, "learning_rate": 0.001, "loss": 1.8732, "step": 464900 }, { "epoch": 27.19775399192841, "grad_norm": 0.20703928172588348, "learning_rate": 0.001, "loss": 1.8803, "step": 465000 }, { "epoch": 27.203602971281512, "grad_norm": 0.21577531099319458, "learning_rate": 0.001, "loss": 1.8788, "step": 465100 }, { "epoch": 27.209451950634616, "grad_norm": 0.23349136114120483, "learning_rate": 0.001, "loss": 1.8797, "step": 465200 }, { "epoch": 27.215300929987716, "grad_norm": 0.20168055593967438, "learning_rate": 0.001, "loss": 1.8807, "step": 465300 }, { "epoch": 27.22114990934082, "grad_norm": 0.3506438732147217, "learning_rate": 0.001, "loss": 1.8781, "step": 465400 }, { "epoch": 27.226998888693924, "grad_norm": 0.22542600333690643, "learning_rate": 0.001, "loss": 1.8811, "step": 465500 }, { "epoch": 27.232847868047024, "grad_norm": 0.19690291583538055, "learning_rate": 0.001, "loss": 1.877, "step": 465600 }, { "epoch": 27.238696847400128, "grad_norm": 0.17599958181381226, "learning_rate": 0.001, "loss": 1.8813, "step": 465700 }, { "epoch": 27.24454582675323, "grad_norm": 0.1692051738500595, "learning_rate": 0.001, "loss": 1.8736, "step": 465800 }, { "epoch": 27.250394806106335, "grad_norm": 0.2203117311000824, "learning_rate": 0.001, "loss": 1.8754, "step": 465900 }, { "epoch": 27.256243785459436, "grad_norm": 0.19353993237018585, "learning_rate": 0.001, "loss": 1.881, "step": 466000 }, { "epoch": 27.26209276481254, "grad_norm": 0.1775546818971634, "learning_rate": 0.001, "loss": 1.8797, "step": 466100 }, { "epoch": 27.267941744165643, "grad_norm": 0.21625176072120667, "learning_rate": 0.001, "loss": 1.8775, "step": 466200 }, { "epoch": 27.273790723518747, "grad_norm": 0.20213942229747772, "learning_rate": 0.001, "loss": 1.878, "step": 466300 }, { "epoch": 27.279639702871847, "grad_norm": 0.1657015085220337, "learning_rate": 0.001, "loss": 1.8813, "step": 466400 }, { "epoch": 27.28548868222495, "grad_norm": 0.2142123430967331, "learning_rate": 0.001, "loss": 1.8806, "step": 466500 }, { "epoch": 27.291337661578055, "grad_norm": 0.254354864358902, "learning_rate": 0.001, "loss": 1.8764, "step": 466600 }, { "epoch": 27.29718664093116, "grad_norm": 0.2142970860004425, "learning_rate": 0.001, "loss": 1.8783, "step": 466700 }, { "epoch": 27.30303562028426, "grad_norm": 0.1964450478553772, "learning_rate": 0.001, "loss": 1.8808, "step": 466800 }, { "epoch": 27.308884599637363, "grad_norm": 0.20611467957496643, "learning_rate": 0.001, "loss": 1.8733, "step": 466900 }, { "epoch": 27.314733578990467, "grad_norm": 0.21388280391693115, "learning_rate": 0.001, "loss": 1.8759, "step": 467000 }, { "epoch": 27.32058255834357, "grad_norm": 0.24167130887508392, "learning_rate": 0.001, "loss": 1.8742, "step": 467100 }, { "epoch": 27.32643153769667, "grad_norm": 0.23808713257312775, "learning_rate": 0.001, "loss": 1.8769, "step": 467200 }, { "epoch": 27.332280517049774, "grad_norm": 0.21170417964458466, "learning_rate": 0.001, "loss": 1.8758, "step": 467300 }, { "epoch": 27.33812949640288, "grad_norm": 0.23030203580856323, "learning_rate": 0.001, "loss": 1.8822, "step": 467400 }, { "epoch": 27.343978475755982, "grad_norm": 0.23403523862361908, "learning_rate": 0.001, "loss": 1.8782, "step": 467500 }, { "epoch": 27.349827455109082, "grad_norm": 0.1684146672487259, "learning_rate": 0.001, "loss": 1.8833, "step": 467600 }, { "epoch": 27.355676434462186, "grad_norm": 0.18146812915802002, "learning_rate": 0.001, "loss": 1.8765, "step": 467700 }, { "epoch": 27.36152541381529, "grad_norm": 0.18425408005714417, "learning_rate": 0.001, "loss": 1.8819, "step": 467800 }, { "epoch": 27.367374393168394, "grad_norm": 0.2518150508403778, "learning_rate": 0.001, "loss": 1.8804, "step": 467900 }, { "epoch": 27.373223372521494, "grad_norm": 0.19972673058509827, "learning_rate": 0.001, "loss": 1.8798, "step": 468000 }, { "epoch": 27.379072351874598, "grad_norm": 0.19616582989692688, "learning_rate": 0.001, "loss": 1.8795, "step": 468100 }, { "epoch": 27.3849213312277, "grad_norm": 0.2230408936738968, "learning_rate": 0.001, "loss": 1.8822, "step": 468200 }, { "epoch": 27.390770310580805, "grad_norm": 0.20752061903476715, "learning_rate": 0.001, "loss": 1.8811, "step": 468300 }, { "epoch": 27.396619289933906, "grad_norm": 0.21486137807369232, "learning_rate": 0.001, "loss": 1.8791, "step": 468400 }, { "epoch": 27.40246826928701, "grad_norm": 0.20917899906635284, "learning_rate": 0.001, "loss": 1.8842, "step": 468500 }, { "epoch": 27.408317248640113, "grad_norm": 0.2509396970272064, "learning_rate": 0.001, "loss": 1.8787, "step": 468600 }, { "epoch": 27.414166227993213, "grad_norm": 0.21402645111083984, "learning_rate": 0.001, "loss": 1.8772, "step": 468700 }, { "epoch": 27.420015207346317, "grad_norm": 0.18630413711071014, "learning_rate": 0.001, "loss": 1.8774, "step": 468800 }, { "epoch": 27.42586418669942, "grad_norm": 0.1720217913389206, "learning_rate": 0.001, "loss": 1.8823, "step": 468900 }, { "epoch": 27.431713166052525, "grad_norm": 0.1725466400384903, "learning_rate": 0.001, "loss": 1.8795, "step": 469000 }, { "epoch": 27.437562145405625, "grad_norm": 0.1951526701450348, "learning_rate": 0.001, "loss": 1.8831, "step": 469100 }, { "epoch": 27.44341112475873, "grad_norm": 0.2614439129829407, "learning_rate": 0.001, "loss": 1.8833, "step": 469200 }, { "epoch": 27.449260104111833, "grad_norm": 0.2942865490913391, "learning_rate": 0.001, "loss": 1.8806, "step": 469300 }, { "epoch": 27.455109083464937, "grad_norm": 0.2701260447502136, "learning_rate": 0.001, "loss": 1.8808, "step": 469400 }, { "epoch": 27.460958062818037, "grad_norm": 0.17246410250663757, "learning_rate": 0.001, "loss": 1.8806, "step": 469500 }, { "epoch": 27.46680704217114, "grad_norm": 0.17297708988189697, "learning_rate": 0.001, "loss": 1.8794, "step": 469600 }, { "epoch": 27.472656021524244, "grad_norm": 0.22070498764514923, "learning_rate": 0.001, "loss": 1.8802, "step": 469700 }, { "epoch": 27.478505000877348, "grad_norm": 0.17674313485622406, "learning_rate": 0.001, "loss": 1.8857, "step": 469800 }, { "epoch": 27.48435398023045, "grad_norm": 0.15656931698322296, "learning_rate": 0.001, "loss": 1.8825, "step": 469900 }, { "epoch": 27.490202959583552, "grad_norm": 0.2608446776866913, "learning_rate": 0.001, "loss": 1.8744, "step": 470000 }, { "epoch": 27.496051938936656, "grad_norm": 0.18035222589969635, "learning_rate": 0.001, "loss": 1.8804, "step": 470100 }, { "epoch": 27.50190091828976, "grad_norm": 0.1587059050798416, "learning_rate": 0.001, "loss": 1.8826, "step": 470200 }, { "epoch": 27.50774989764286, "grad_norm": 0.16080324351787567, "learning_rate": 0.001, "loss": 1.8825, "step": 470300 }, { "epoch": 27.513598876995964, "grad_norm": 0.15345105528831482, "learning_rate": 0.001, "loss": 1.8771, "step": 470400 }, { "epoch": 27.519447856349068, "grad_norm": 0.24135199189186096, "learning_rate": 0.001, "loss": 1.8775, "step": 470500 }, { "epoch": 27.52529683570217, "grad_norm": 0.18674467504024506, "learning_rate": 0.001, "loss": 1.8826, "step": 470600 }, { "epoch": 27.53114581505527, "grad_norm": 0.25785887241363525, "learning_rate": 0.001, "loss": 1.8833, "step": 470700 }, { "epoch": 27.536994794408375, "grad_norm": 0.2229098379611969, "learning_rate": 0.001, "loss": 1.8772, "step": 470800 }, { "epoch": 27.54284377376148, "grad_norm": 0.33261412382125854, "learning_rate": 0.001, "loss": 1.8864, "step": 470900 }, { "epoch": 27.548692753114583, "grad_norm": 0.22311310470104218, "learning_rate": 0.001, "loss": 1.8822, "step": 471000 }, { "epoch": 27.554541732467683, "grad_norm": 0.1716281622648239, "learning_rate": 0.001, "loss": 1.8772, "step": 471100 }, { "epoch": 27.560390711820787, "grad_norm": 0.20102442800998688, "learning_rate": 0.001, "loss": 1.8826, "step": 471200 }, { "epoch": 27.56623969117389, "grad_norm": 0.19300752878189087, "learning_rate": 0.001, "loss": 1.8812, "step": 471300 }, { "epoch": 27.572088670526995, "grad_norm": 0.16570629179477692, "learning_rate": 0.001, "loss": 1.8866, "step": 471400 }, { "epoch": 27.577937649880095, "grad_norm": 0.19103528559207916, "learning_rate": 0.001, "loss": 1.8799, "step": 471500 }, { "epoch": 27.5837866292332, "grad_norm": 0.2234923094511032, "learning_rate": 0.001, "loss": 1.8798, "step": 471600 }, { "epoch": 27.589635608586303, "grad_norm": 0.21499072015285492, "learning_rate": 0.001, "loss": 1.8873, "step": 471700 }, { "epoch": 27.595484587939403, "grad_norm": 0.21738286316394806, "learning_rate": 0.001, "loss": 1.8818, "step": 471800 }, { "epoch": 27.601333567292507, "grad_norm": 0.19235672056674957, "learning_rate": 0.001, "loss": 1.885, "step": 471900 }, { "epoch": 27.60718254664561, "grad_norm": 0.18957093358039856, "learning_rate": 0.001, "loss": 1.8775, "step": 472000 }, { "epoch": 27.613031525998714, "grad_norm": 0.17502819001674652, "learning_rate": 0.001, "loss": 1.8836, "step": 472100 }, { "epoch": 27.618880505351814, "grad_norm": 0.18870848417282104, "learning_rate": 0.001, "loss": 1.8729, "step": 472200 }, { "epoch": 27.62472948470492, "grad_norm": 0.16994498670101166, "learning_rate": 0.001, "loss": 1.8857, "step": 472300 }, { "epoch": 27.630578464058022, "grad_norm": 0.21591901779174805, "learning_rate": 0.001, "loss": 1.8815, "step": 472400 }, { "epoch": 27.636427443411126, "grad_norm": 0.24209529161453247, "learning_rate": 0.001, "loss": 1.8874, "step": 472500 }, { "epoch": 27.642276422764226, "grad_norm": 0.2539321482181549, "learning_rate": 0.001, "loss": 1.8788, "step": 472600 }, { "epoch": 27.64812540211733, "grad_norm": 0.24150699377059937, "learning_rate": 0.001, "loss": 1.8873, "step": 472700 }, { "epoch": 27.653974381470434, "grad_norm": 0.24105815589427948, "learning_rate": 0.001, "loss": 1.8836, "step": 472800 }, { "epoch": 27.659823360823538, "grad_norm": 0.16312375664710999, "learning_rate": 0.001, "loss": 1.8858, "step": 472900 }, { "epoch": 27.665672340176638, "grad_norm": 0.183799147605896, "learning_rate": 0.001, "loss": 1.8803, "step": 473000 }, { "epoch": 27.67152131952974, "grad_norm": 0.18777760863304138, "learning_rate": 0.001, "loss": 1.8883, "step": 473100 }, { "epoch": 27.677370298882845, "grad_norm": 0.2106543630361557, "learning_rate": 0.001, "loss": 1.8795, "step": 473200 }, { "epoch": 27.68321927823595, "grad_norm": 0.22219474613666534, "learning_rate": 0.001, "loss": 1.8807, "step": 473300 }, { "epoch": 27.68906825758905, "grad_norm": 0.3776751160621643, "learning_rate": 0.001, "loss": 1.8841, "step": 473400 }, { "epoch": 27.694917236942153, "grad_norm": 0.2638006806373596, "learning_rate": 0.001, "loss": 1.8906, "step": 473500 }, { "epoch": 27.700766216295257, "grad_norm": 0.20743195712566376, "learning_rate": 0.001, "loss": 1.8817, "step": 473600 }, { "epoch": 27.70661519564836, "grad_norm": 0.15839555859565735, "learning_rate": 0.001, "loss": 1.8788, "step": 473700 }, { "epoch": 27.71246417500146, "grad_norm": 0.18707160651683807, "learning_rate": 0.001, "loss": 1.89, "step": 473800 }, { "epoch": 27.718313154354565, "grad_norm": 0.21359993517398834, "learning_rate": 0.001, "loss": 1.8847, "step": 473900 }, { "epoch": 27.72416213370767, "grad_norm": 0.24090950191020966, "learning_rate": 0.001, "loss": 1.8832, "step": 474000 }, { "epoch": 27.730011113060772, "grad_norm": 0.22400347888469696, "learning_rate": 0.001, "loss": 1.884, "step": 474100 }, { "epoch": 27.735860092413873, "grad_norm": 0.2665601372718811, "learning_rate": 0.001, "loss": 1.8803, "step": 474200 }, { "epoch": 27.741709071766977, "grad_norm": 0.23827986419200897, "learning_rate": 0.001, "loss": 1.8841, "step": 474300 }, { "epoch": 27.74755805112008, "grad_norm": 0.17497430741786957, "learning_rate": 0.001, "loss": 1.8845, "step": 474400 }, { "epoch": 27.753407030473184, "grad_norm": 0.21503156423568726, "learning_rate": 0.001, "loss": 1.8862, "step": 474500 }, { "epoch": 27.759256009826284, "grad_norm": 0.1856810599565506, "learning_rate": 0.001, "loss": 1.8834, "step": 474600 }, { "epoch": 27.765104989179388, "grad_norm": 0.215917706489563, "learning_rate": 0.001, "loss": 1.8793, "step": 474700 }, { "epoch": 27.770953968532492, "grad_norm": 0.21282941102981567, "learning_rate": 0.001, "loss": 1.8854, "step": 474800 }, { "epoch": 27.776802947885592, "grad_norm": 0.20843659341335297, "learning_rate": 0.001, "loss": 1.8827, "step": 474900 }, { "epoch": 27.782651927238696, "grad_norm": 0.26443788409233093, "learning_rate": 0.001, "loss": 1.8857, "step": 475000 }, { "epoch": 27.782651927238696, "eval_ag_news_accuracy": 0.2394375, "eval_ag_news_bleu_score": 6.899297319732321, "eval_ag_news_bleu_score_sem": 0.4559507368300325, "eval_ag_news_emb_cos_sim": 0.7131271362304688, "eval_ag_news_emb_cos_sim_sem": 0.012669695541262627, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.71541690826416, "eval_ag_news_n_ngrams_match_1": 13.8046875, "eval_ag_news_n_ngrams_match_2": 4.03125, "eval_ag_news_n_ngrams_match_3": 1.4609375, "eval_ag_news_num_pred_words": 46.8359375, "eval_ag_news_num_true_words": 45.359375, "eval_ag_news_perplexity": 15.110908621528758, "eval_ag_news_pred_num_tokens": 69.140625, "eval_ag_news_rouge_score": 0.29320743981043484, "eval_ag_news_runtime": 38.4637, "eval_ag_news_samples_per_second": 12.999, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3259622134108434, "eval_ag_news_token_set_f1_sem": 0.009150944124870015, "eval_ag_news_token_set_precision": 0.3039940801064256, "eval_ag_news_token_set_recall": 0.3638380773644121, "eval_ag_news_true_num_tokens": 62.8125, "step": 475000 }, { "epoch": 27.782651927238696, "eval_anthropic_toxic_prompts_accuracy": 0.103, "eval_anthropic_toxic_prompts_bleu_score": 40.60312420412993, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.520042799425231, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8673845529556274, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010618932545185089, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.109375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02769520686450485, "eval_anthropic_toxic_prompts_loss": 1.293805718421936, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.5234375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.171875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.3515625, "eval_anthropic_toxic_prompts_num_pred_words": 13.9609375, "eval_anthropic_toxic_prompts_num_true_words": 14.4453125, "eval_anthropic_toxic_prompts_perplexity": 3.6466382595948255, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.2109375, "eval_anthropic_toxic_prompts_rouge_score": 0.6756264457080077, "eval_anthropic_toxic_prompts_runtime": 31.2374, "eval_anthropic_toxic_prompts_samples_per_second": 16.006, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.685892995037993, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.020122246776592415, "eval_anthropic_toxic_prompts_token_set_precision": 0.6764306431742342, "eval_anthropic_toxic_prompts_token_set_recall": 0.70176578927759, "eval_anthropic_toxic_prompts_true_num_tokens": 17.4765625, "step": 475000 }, { "epoch": 27.782651927238696, "eval_arxiv_accuracy": 0.37271875, "eval_arxiv_bleu_score": 1.7552716943366127, "eval_arxiv_bleu_score_sem": 0.1996080931728112, "eval_arxiv_emb_cos_sim": 0.5255429148674011, "eval_arxiv_emb_cos_sim_sem": 0.018397655338048935, "eval_arxiv_emb_top1_equal": 0.890625, "eval_arxiv_emb_top1_equal_sem": 0.02769520878791809, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.43660044670105, "eval_arxiv_n_ngrams_match_1": 13.8515625, "eval_arxiv_n_ngrams_match_2": 2.546875, "eval_arxiv_n_ngrams_match_3": 0.5078125, "eval_arxiv_num_pred_words": 53.03125, "eval_arxiv_num_true_words": 86.6328125, "eval_arxiv_perplexity": 31.081116450997836, "eval_arxiv_pred_num_tokens": 125.1484375, "eval_arxiv_rouge_score": 0.18710931162824213, "eval_arxiv_runtime": 32.0172, "eval_arxiv_samples_per_second": 15.617, "eval_arxiv_steps_per_second": 0.031, "eval_arxiv_token_set_f1": 0.17900969600704972, "eval_arxiv_token_set_f1_sem": 0.00797962356402482, "eval_arxiv_token_set_precision": 0.1209767743687986, "eval_arxiv_token_set_recall": 0.4483120636413935, "eval_arxiv_true_num_tokens": 125.7421875, "step": 475000 }, { "epoch": 27.782651927238696, "eval_python_code_alpaca_accuracy": 0.130546875, "eval_python_code_alpaca_bleu_score": 31.35263415219807, "eval_python_code_alpaca_bleu_score_sem": 1.9005194586691452, "eval_python_code_alpaca_emb_cos_sim": 0.8638502955436707, "eval_python_code_alpaca_emb_cos_sim_sem": 0.010257664136588573, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.5060213804244995, "eval_python_code_alpaca_n_ngrams_match_1": 10.4765625, "eval_python_code_alpaca_n_ngrams_match_2": 6.0234375, "eval_python_code_alpaca_n_ngrams_match_3": 3.6484375, "eval_python_code_alpaca_num_pred_words": 16.9375, "eval_python_code_alpaca_num_true_words": 18.6640625, "eval_python_code_alpaca_perplexity": 4.50875643484249, "eval_python_code_alpaca_pred_num_tokens": 22.6953125, "eval_python_code_alpaca_rouge_score": 0.619165405919215, "eval_python_code_alpaca_runtime": 30.6688, "eval_python_code_alpaca_samples_per_second": 16.303, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6338723170266828, "eval_python_code_alpaca_token_set_f1_sem": 0.015053170567771852, "eval_python_code_alpaca_token_set_precision": 0.6175957904074754, "eval_python_code_alpaca_token_set_recall": 0.6571119935505073, "eval_python_code_alpaca_true_num_tokens": 24.109375, "step": 475000 }, { "epoch": 27.782651927238696, "eval_wikibio_accuracy": 0.362546875, "eval_wikibio_bleu_score": 7.387086910274958, "eval_wikibio_bleu_score_sem": 0.7338809498529998, "eval_wikibio_emb_cos_sim": 0.6027933359146118, "eval_wikibio_emb_cos_sim_sem": 0.022047193720936775, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7460885047912598, "eval_wikibio_n_ngrams_match_1": 14.3671875, "eval_wikibio_n_ngrams_match_2": 4.8671875, "eval_wikibio_n_ngrams_match_3": 2.0234375, "eval_wikibio_num_pred_words": 50.953125, "eval_wikibio_num_true_words": 50.515625, "eval_wikibio_perplexity": 15.581565313179853, "eval_wikibio_pred_num_tokens": 105.5078125, "eval_wikibio_rouge_score": 0.29057615201272935, "eval_wikibio_runtime": 32.3473, "eval_wikibio_samples_per_second": 15.457, "eval_wikibio_steps_per_second": 0.031, "eval_wikibio_token_set_f1": 0.3130404011643629, "eval_wikibio_token_set_f1_sem": 0.013646565042948255, "eval_wikibio_token_set_precision": 0.2779570090833749, "eval_wikibio_token_set_recall": 0.38799668596279707, "eval_wikibio_true_num_tokens": 97.6484375, "step": 475000 }, { "epoch": 27.782651927238696, "eval_msmarco_accuracy": 0.391328125, "eval_msmarco_bleu_score": 14.798786883008553, "eval_msmarco_bleu_score_sem": 1.126484583819029, "eval_msmarco_emb_cos_sim": 0.7746825218200684, "eval_msmarco_emb_cos_sim_sem": 0.014929384924471378, "eval_msmarco_emb_top1_equal": 0.921875, "eval_msmarco_emb_top1_equal_sem": 0.023813825100660324, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7505320310592651, "eval_msmarco_n_ngrams_match_1": 27.1328125, "eval_msmarco_n_ngrams_match_2": 11.890625, "eval_msmarco_n_ngrams_match_3": 6.28125, "eval_msmarco_num_pred_words": 59.9140625, "eval_msmarco_num_true_words": 61.484375, "eval_msmarco_perplexity": 5.757665117947973, "eval_msmarco_pred_num_tokens": 83.1015625, "eval_msmarco_rouge_score": 0.4210620505857927, "eval_msmarco_runtime": 26.0018, "eval_msmarco_samples_per_second": 19.229, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.459068879688171, "eval_msmarco_token_set_f1_sem": 0.013118263100368312, "eval_msmarco_token_set_precision": 0.4170361410642021, "eval_msmarco_token_set_recall": 0.534669779509068, "eval_msmarco_true_num_tokens": 80.015625, "step": 475000 }, { "epoch": 27.7885009065918, "grad_norm": 0.19515375792980194, "learning_rate": 0.001, "loss": 1.8788, "step": 475100 }, { "epoch": 27.794349885944904, "grad_norm": 0.22348304092884064, "learning_rate": 0.001, "loss": 1.8811, "step": 475200 }, { "epoch": 27.800198865298004, "grad_norm": 0.3271535038948059, "learning_rate": 0.001, "loss": 1.8814, "step": 475300 }, { "epoch": 27.806047844651108, "grad_norm": 0.21075096726417542, "learning_rate": 0.001, "loss": 1.8831, "step": 475400 }, { "epoch": 27.81189682400421, "grad_norm": 0.19126680493354797, "learning_rate": 0.001, "loss": 1.8801, "step": 475500 }, { "epoch": 27.817745803357315, "grad_norm": 0.23818297684192657, "learning_rate": 0.001, "loss": 1.8798, "step": 475600 }, { "epoch": 27.823594782710416, "grad_norm": 0.19889572262763977, "learning_rate": 0.001, "loss": 1.8833, "step": 475700 }, { "epoch": 27.82944376206352, "grad_norm": 0.21203897893428802, "learning_rate": 0.001, "loss": 1.8824, "step": 475800 }, { "epoch": 27.835292741416623, "grad_norm": 0.2323421835899353, "learning_rate": 0.001, "loss": 1.8803, "step": 475900 }, { "epoch": 27.841141720769727, "grad_norm": 0.20353193581104279, "learning_rate": 0.001, "loss": 1.8797, "step": 476000 }, { "epoch": 27.846990700122827, "grad_norm": 0.20402802526950836, "learning_rate": 0.001, "loss": 1.8832, "step": 476100 }, { "epoch": 27.85283967947593, "grad_norm": 0.21019700169563293, "learning_rate": 0.001, "loss": 1.878, "step": 476200 }, { "epoch": 27.858688658829035, "grad_norm": 0.2034255564212799, "learning_rate": 0.001, "loss": 1.8842, "step": 476300 }, { "epoch": 27.86453763818214, "grad_norm": 0.20999424159526825, "learning_rate": 0.001, "loss": 1.8844, "step": 476400 }, { "epoch": 27.87038661753524, "grad_norm": 0.2828521132469177, "learning_rate": 0.001, "loss": 1.8833, "step": 476500 }, { "epoch": 27.876235596888343, "grad_norm": 0.22617198526859283, "learning_rate": 0.001, "loss": 1.888, "step": 476600 }, { "epoch": 27.882084576241446, "grad_norm": 0.18821506202220917, "learning_rate": 0.001, "loss": 1.8815, "step": 476700 }, { "epoch": 27.88793355559455, "grad_norm": 0.21253404021263123, "learning_rate": 0.001, "loss": 1.8778, "step": 476800 }, { "epoch": 27.89378253494765, "grad_norm": 0.21332773566246033, "learning_rate": 0.001, "loss": 1.8788, "step": 476900 }, { "epoch": 27.899631514300754, "grad_norm": 0.24053050577640533, "learning_rate": 0.001, "loss": 1.8839, "step": 477000 }, { "epoch": 27.905480493653858, "grad_norm": 0.206858292222023, "learning_rate": 0.001, "loss": 1.8833, "step": 477100 }, { "epoch": 27.911329473006962, "grad_norm": 0.19519180059432983, "learning_rate": 0.001, "loss": 1.8828, "step": 477200 }, { "epoch": 27.917178452360062, "grad_norm": 0.20129498839378357, "learning_rate": 0.001, "loss": 1.8836, "step": 477300 }, { "epoch": 27.923027431713166, "grad_norm": 0.2434065043926239, "learning_rate": 0.001, "loss": 1.8847, "step": 477400 }, { "epoch": 27.92887641106627, "grad_norm": 0.3026157021522522, "learning_rate": 0.001, "loss": 1.8855, "step": 477500 }, { "epoch": 27.934725390419374, "grad_norm": 0.20320838689804077, "learning_rate": 0.001, "loss": 1.8817, "step": 477600 }, { "epoch": 27.940574369772474, "grad_norm": 0.25622913241386414, "learning_rate": 0.001, "loss": 1.88, "step": 477700 }, { "epoch": 27.946423349125578, "grad_norm": 0.17202672362327576, "learning_rate": 0.001, "loss": 1.8842, "step": 477800 }, { "epoch": 27.95227232847868, "grad_norm": 0.23393794894218445, "learning_rate": 0.001, "loss": 1.8818, "step": 477900 }, { "epoch": 27.95812130783178, "grad_norm": 0.24179910123348236, "learning_rate": 0.001, "loss": 1.8879, "step": 478000 }, { "epoch": 27.963970287184885, "grad_norm": 0.17681515216827393, "learning_rate": 0.001, "loss": 1.8831, "step": 478100 }, { "epoch": 27.96981926653799, "grad_norm": 0.2691887617111206, "learning_rate": 0.001, "loss": 1.8817, "step": 478200 }, { "epoch": 27.975668245891093, "grad_norm": 0.19096091389656067, "learning_rate": 0.001, "loss": 1.8834, "step": 478300 }, { "epoch": 27.981517225244193, "grad_norm": 0.28307294845581055, "learning_rate": 0.001, "loss": 1.8847, "step": 478400 }, { "epoch": 27.987366204597297, "grad_norm": 0.19702817499637604, "learning_rate": 0.001, "loss": 1.8783, "step": 478500 }, { "epoch": 27.9932151839504, "grad_norm": 0.2042251080274582, "learning_rate": 0.001, "loss": 1.8787, "step": 478600 }, { "epoch": 27.999064163303505, "grad_norm": 0.2302335947751999, "learning_rate": 0.001, "loss": 1.8824, "step": 478700 }, { "epoch": 28.004913142656605, "grad_norm": 0.22842887043952942, "learning_rate": 0.001, "loss": 1.8718, "step": 478800 }, { "epoch": 28.01076212200971, "grad_norm": 0.2297513335943222, "learning_rate": 0.001, "loss": 1.869, "step": 478900 }, { "epoch": 28.016611101362813, "grad_norm": 0.23963110148906708, "learning_rate": 0.001, "loss": 1.8661, "step": 479000 }, { "epoch": 28.022460080715916, "grad_norm": 0.21370384097099304, "learning_rate": 0.001, "loss": 1.869, "step": 479100 }, { "epoch": 28.028309060069017, "grad_norm": 0.2656705677509308, "learning_rate": 0.001, "loss": 1.8716, "step": 479200 }, { "epoch": 28.03415803942212, "grad_norm": 0.21110296249389648, "learning_rate": 0.001, "loss": 1.8656, "step": 479300 }, { "epoch": 28.040007018775224, "grad_norm": 0.17632365226745605, "learning_rate": 0.001, "loss": 1.8635, "step": 479400 }, { "epoch": 28.045855998128328, "grad_norm": 0.21574798226356506, "learning_rate": 0.001, "loss": 1.8734, "step": 479500 }, { "epoch": 28.051704977481428, "grad_norm": 0.20816287398338318, "learning_rate": 0.001, "loss": 1.873, "step": 479600 }, { "epoch": 28.057553956834532, "grad_norm": 0.19222977757453918, "learning_rate": 0.001, "loss": 1.8697, "step": 479700 }, { "epoch": 28.063402936187636, "grad_norm": 0.265281081199646, "learning_rate": 0.001, "loss": 1.8734, "step": 479800 }, { "epoch": 28.06925191554074, "grad_norm": 0.2081725299358368, "learning_rate": 0.001, "loss": 1.8689, "step": 479900 }, { "epoch": 28.07510089489384, "grad_norm": 0.18834972381591797, "learning_rate": 0.001, "loss": 1.8682, "step": 480000 }, { "epoch": 28.080949874246944, "grad_norm": 0.21089696884155273, "learning_rate": 0.001, "loss": 1.8685, "step": 480100 }, { "epoch": 28.086798853600047, "grad_norm": 0.20664429664611816, "learning_rate": 0.001, "loss": 1.8666, "step": 480200 }, { "epoch": 28.09264783295315, "grad_norm": 0.2303871363401413, "learning_rate": 0.001, "loss": 1.8731, "step": 480300 }, { "epoch": 28.09849681230625, "grad_norm": 0.1711670160293579, "learning_rate": 0.001, "loss": 1.8643, "step": 480400 }, { "epoch": 28.104345791659355, "grad_norm": 0.2345239669084549, "learning_rate": 0.001, "loss": 1.8679, "step": 480500 }, { "epoch": 28.11019477101246, "grad_norm": 0.24907009303569794, "learning_rate": 0.001, "loss": 1.874, "step": 480600 }, { "epoch": 28.116043750365563, "grad_norm": 0.21735255420207977, "learning_rate": 0.001, "loss": 1.8726, "step": 480700 }, { "epoch": 28.121892729718663, "grad_norm": 0.2283458411693573, "learning_rate": 0.001, "loss": 1.8694, "step": 480800 }, { "epoch": 28.127741709071767, "grad_norm": 0.20856624841690063, "learning_rate": 0.001, "loss": 1.8658, "step": 480900 }, { "epoch": 28.13359068842487, "grad_norm": 0.1965222805738449, "learning_rate": 0.001, "loss": 1.8687, "step": 481000 }, { "epoch": 28.13943966777797, "grad_norm": 0.22130751609802246, "learning_rate": 0.001, "loss": 1.8657, "step": 481100 }, { "epoch": 28.145288647131075, "grad_norm": 0.1703406274318695, "learning_rate": 0.001, "loss": 1.8797, "step": 481200 }, { "epoch": 28.15113762648418, "grad_norm": 0.22403469681739807, "learning_rate": 0.001, "loss": 1.8699, "step": 481300 }, { "epoch": 28.156986605837282, "grad_norm": 0.18081845343112946, "learning_rate": 0.001, "loss": 1.8699, "step": 481400 }, { "epoch": 28.162835585190383, "grad_norm": 0.2249055653810501, "learning_rate": 0.001, "loss": 1.8774, "step": 481500 }, { "epoch": 28.168684564543486, "grad_norm": 0.22796140611171722, "learning_rate": 0.001, "loss": 1.8714, "step": 481600 }, { "epoch": 28.17453354389659, "grad_norm": 0.21144279837608337, "learning_rate": 0.001, "loss": 1.8708, "step": 481700 }, { "epoch": 28.180382523249694, "grad_norm": 0.19019266963005066, "learning_rate": 0.001, "loss": 1.873, "step": 481800 }, { "epoch": 28.186231502602794, "grad_norm": 0.21886926889419556, "learning_rate": 0.001, "loss": 1.8768, "step": 481900 }, { "epoch": 28.192080481955898, "grad_norm": 0.1533755213022232, "learning_rate": 0.001, "loss": 1.8784, "step": 482000 }, { "epoch": 28.197929461309002, "grad_norm": 0.19887718558311462, "learning_rate": 0.001, "loss": 1.8656, "step": 482100 }, { "epoch": 28.203778440662106, "grad_norm": 0.20487943291664124, "learning_rate": 0.001, "loss": 1.8711, "step": 482200 }, { "epoch": 28.209627420015206, "grad_norm": 0.1921287626028061, "learning_rate": 0.001, "loss": 1.8702, "step": 482300 }, { "epoch": 28.21547639936831, "grad_norm": 0.26897042989730835, "learning_rate": 0.001, "loss": 1.8677, "step": 482400 }, { "epoch": 28.221325378721414, "grad_norm": 0.16249175369739532, "learning_rate": 0.001, "loss": 1.8822, "step": 482500 }, { "epoch": 28.227174358074517, "grad_norm": 0.20060236752033234, "learning_rate": 0.001, "loss": 1.8737, "step": 482600 }, { "epoch": 28.233023337427618, "grad_norm": 0.20431147515773773, "learning_rate": 0.001, "loss": 1.8765, "step": 482700 }, { "epoch": 28.23887231678072, "grad_norm": 0.2111813724040985, "learning_rate": 0.001, "loss": 1.8784, "step": 482800 }, { "epoch": 28.244721296133825, "grad_norm": 0.21432727575302124, "learning_rate": 0.001, "loss": 1.8705, "step": 482900 }, { "epoch": 28.25057027548693, "grad_norm": 0.26268163323402405, "learning_rate": 0.001, "loss": 1.8729, "step": 483000 }, { "epoch": 28.25641925484003, "grad_norm": 0.20453935861587524, "learning_rate": 0.001, "loss": 1.8743, "step": 483100 }, { "epoch": 28.262268234193133, "grad_norm": 0.19738733768463135, "learning_rate": 0.001, "loss": 1.8756, "step": 483200 }, { "epoch": 28.268117213546237, "grad_norm": 0.2486942857503891, "learning_rate": 0.001, "loss": 1.8778, "step": 483300 }, { "epoch": 28.27396619289934, "grad_norm": 0.21163108944892883, "learning_rate": 0.001, "loss": 1.8729, "step": 483400 }, { "epoch": 28.27981517225244, "grad_norm": 0.2300024777650833, "learning_rate": 0.001, "loss": 1.883, "step": 483500 }, { "epoch": 28.285664151605545, "grad_norm": 0.2402220070362091, "learning_rate": 0.001, "loss": 1.8762, "step": 483600 }, { "epoch": 28.29151313095865, "grad_norm": 0.31386104226112366, "learning_rate": 0.001, "loss": 1.8791, "step": 483700 }, { "epoch": 28.297362110311752, "grad_norm": 0.21558767557144165, "learning_rate": 0.001, "loss": 1.872, "step": 483800 }, { "epoch": 28.303211089664853, "grad_norm": 0.20247569680213928, "learning_rate": 0.001, "loss": 1.8707, "step": 483900 }, { "epoch": 28.309060069017956, "grad_norm": 0.23199836909770966, "learning_rate": 0.001, "loss": 1.8738, "step": 484000 }, { "epoch": 28.31490904837106, "grad_norm": 0.22190755605697632, "learning_rate": 0.001, "loss": 1.8716, "step": 484100 }, { "epoch": 28.32075802772416, "grad_norm": 0.23344019055366516, "learning_rate": 0.001, "loss": 1.8777, "step": 484200 }, { "epoch": 28.326607007077264, "grad_norm": 0.24239444732666016, "learning_rate": 0.001, "loss": 1.8823, "step": 484300 }, { "epoch": 28.332455986430368, "grad_norm": 0.1945730745792389, "learning_rate": 0.001, "loss": 1.8776, "step": 484400 }, { "epoch": 28.338304965783472, "grad_norm": 0.162777379155159, "learning_rate": 0.001, "loss": 1.8736, "step": 484500 }, { "epoch": 28.344153945136572, "grad_norm": 0.21356192231178284, "learning_rate": 0.001, "loss": 1.8675, "step": 484600 }, { "epoch": 28.350002924489676, "grad_norm": 0.23204980790615082, "learning_rate": 0.001, "loss": 1.8784, "step": 484700 }, { "epoch": 28.35585190384278, "grad_norm": 0.1859487146139145, "learning_rate": 0.001, "loss": 1.8729, "step": 484800 }, { "epoch": 28.361700883195883, "grad_norm": 0.245778426527977, "learning_rate": 0.001, "loss": 1.8739, "step": 484900 }, { "epoch": 28.367549862548984, "grad_norm": 0.27269431948661804, "learning_rate": 0.001, "loss": 1.8761, "step": 485000 }, { "epoch": 28.373398841902087, "grad_norm": 0.21372152864933014, "learning_rate": 0.001, "loss": 1.8767, "step": 485100 }, { "epoch": 28.37924782125519, "grad_norm": 0.21586018800735474, "learning_rate": 0.001, "loss": 1.8738, "step": 485200 }, { "epoch": 28.385096800608295, "grad_norm": 0.20706501603126526, "learning_rate": 0.001, "loss": 1.8766, "step": 485300 }, { "epoch": 28.390945779961395, "grad_norm": 0.2571718096733093, "learning_rate": 0.001, "loss": 1.869, "step": 485400 }, { "epoch": 28.3967947593145, "grad_norm": 0.2281271368265152, "learning_rate": 0.001, "loss": 1.8748, "step": 485500 }, { "epoch": 28.402643738667603, "grad_norm": 0.22709298133850098, "learning_rate": 0.001, "loss": 1.8726, "step": 485600 }, { "epoch": 28.408492718020707, "grad_norm": 0.21493521332740784, "learning_rate": 0.001, "loss": 1.8757, "step": 485700 }, { "epoch": 28.414341697373807, "grad_norm": 0.18934513628482819, "learning_rate": 0.001, "loss": 1.8756, "step": 485800 }, { "epoch": 28.42019067672691, "grad_norm": 0.23048435151576996, "learning_rate": 0.001, "loss": 1.8742, "step": 485900 }, { "epoch": 28.426039656080015, "grad_norm": 0.1648053228855133, "learning_rate": 0.001, "loss": 1.8772, "step": 486000 }, { "epoch": 28.43188863543312, "grad_norm": 0.20938028395175934, "learning_rate": 0.001, "loss": 1.8732, "step": 486100 }, { "epoch": 28.43773761478622, "grad_norm": 0.2791036367416382, "learning_rate": 0.001, "loss": 1.8684, "step": 486200 }, { "epoch": 28.443586594139322, "grad_norm": 0.24590282142162323, "learning_rate": 0.001, "loss": 1.8748, "step": 486300 }, { "epoch": 28.449435573492426, "grad_norm": 0.2098827362060547, "learning_rate": 0.001, "loss": 1.8779, "step": 486400 }, { "epoch": 28.45528455284553, "grad_norm": 0.16063091158866882, "learning_rate": 0.001, "loss": 1.8742, "step": 486500 }, { "epoch": 28.46113353219863, "grad_norm": 0.16693256795406342, "learning_rate": 0.001, "loss": 1.8782, "step": 486600 }, { "epoch": 28.466982511551734, "grad_norm": 0.2302929162979126, "learning_rate": 0.001, "loss": 1.875, "step": 486700 }, { "epoch": 28.472831490904838, "grad_norm": 0.20931845903396606, "learning_rate": 0.001, "loss": 1.8695, "step": 486800 }, { "epoch": 28.47868047025794, "grad_norm": 0.19718532264232635, "learning_rate": 0.001, "loss": 1.8751, "step": 486900 }, { "epoch": 28.484529449611042, "grad_norm": 0.23256534337997437, "learning_rate": 0.001, "loss": 1.8763, "step": 487000 }, { "epoch": 28.490378428964146, "grad_norm": 0.18618661165237427, "learning_rate": 0.001, "loss": 1.8771, "step": 487100 }, { "epoch": 28.49622740831725, "grad_norm": 0.251801073551178, "learning_rate": 0.001, "loss": 1.8811, "step": 487200 }, { "epoch": 28.50207638767035, "grad_norm": 0.2160509079694748, "learning_rate": 0.001, "loss": 1.8736, "step": 487300 }, { "epoch": 28.507925367023454, "grad_norm": 0.22055603563785553, "learning_rate": 0.001, "loss": 1.8766, "step": 487400 }, { "epoch": 28.513774346376557, "grad_norm": 0.2663188874721527, "learning_rate": 0.001, "loss": 1.8729, "step": 487500 }, { "epoch": 28.51962332572966, "grad_norm": 0.2256724089384079, "learning_rate": 0.001, "loss": 1.8785, "step": 487600 }, { "epoch": 28.52547230508276, "grad_norm": 0.19969725608825684, "learning_rate": 0.001, "loss": 1.8786, "step": 487700 }, { "epoch": 28.531321284435865, "grad_norm": 0.2679445445537567, "learning_rate": 0.001, "loss": 1.8737, "step": 487800 }, { "epoch": 28.53717026378897, "grad_norm": 0.1990402191877365, "learning_rate": 0.001, "loss": 1.8771, "step": 487900 }, { "epoch": 28.543019243142073, "grad_norm": 0.17050053179264069, "learning_rate": 0.001, "loss": 1.8772, "step": 488000 }, { "epoch": 28.548868222495173, "grad_norm": 0.2057173103094101, "learning_rate": 0.001, "loss": 1.8778, "step": 488100 }, { "epoch": 28.554717201848277, "grad_norm": 0.23007121682167053, "learning_rate": 0.001, "loss": 1.875, "step": 488200 }, { "epoch": 28.56056618120138, "grad_norm": 0.23787382245063782, "learning_rate": 0.001, "loss": 1.8709, "step": 488300 }, { "epoch": 28.566415160554484, "grad_norm": 0.1969175785779953, "learning_rate": 0.001, "loss": 1.8783, "step": 488400 }, { "epoch": 28.572264139907585, "grad_norm": 0.1891854852437973, "learning_rate": 0.001, "loss": 1.8746, "step": 488500 }, { "epoch": 28.57811311926069, "grad_norm": 0.2059936374425888, "learning_rate": 0.001, "loss": 1.88, "step": 488600 }, { "epoch": 28.583962098613792, "grad_norm": 0.2230185568332672, "learning_rate": 0.001, "loss": 1.8805, "step": 488700 }, { "epoch": 28.589811077966896, "grad_norm": 0.2362312525510788, "learning_rate": 0.001, "loss": 1.8762, "step": 488800 }, { "epoch": 28.595660057319996, "grad_norm": 0.22580638527870178, "learning_rate": 0.001, "loss": 1.8756, "step": 488900 }, { "epoch": 28.6015090366731, "grad_norm": 0.24801218509674072, "learning_rate": 0.001, "loss": 1.8788, "step": 489000 }, { "epoch": 28.607358016026204, "grad_norm": 0.19014468789100647, "learning_rate": 0.001, "loss": 1.886, "step": 489100 }, { "epoch": 28.613206995379308, "grad_norm": 0.22484059631824493, "learning_rate": 0.001, "loss": 1.8812, "step": 489200 }, { "epoch": 28.619055974732408, "grad_norm": 0.22625219821929932, "learning_rate": 0.001, "loss": 1.8735, "step": 489300 }, { "epoch": 28.624904954085512, "grad_norm": 0.23376353085041046, "learning_rate": 0.001, "loss": 1.8819, "step": 489400 }, { "epoch": 28.630753933438616, "grad_norm": 0.24148833751678467, "learning_rate": 0.001, "loss": 1.8833, "step": 489500 }, { "epoch": 28.63660291279172, "grad_norm": 0.1881425976753235, "learning_rate": 0.001, "loss": 1.8721, "step": 489600 }, { "epoch": 28.64245189214482, "grad_norm": 0.23357528448104858, "learning_rate": 0.001, "loss": 1.8745, "step": 489700 }, { "epoch": 28.648300871497923, "grad_norm": 0.19447804987430573, "learning_rate": 0.001, "loss": 1.8772, "step": 489800 }, { "epoch": 28.654149850851027, "grad_norm": 0.28190988302230835, "learning_rate": 0.001, "loss": 1.8774, "step": 489900 }, { "epoch": 28.65999883020413, "grad_norm": 0.19037848711013794, "learning_rate": 0.001, "loss": 1.8735, "step": 490000 }, { "epoch": 28.66584780955723, "grad_norm": 0.24624085426330566, "learning_rate": 0.001, "loss": 1.8776, "step": 490100 }, { "epoch": 28.671696788910335, "grad_norm": 0.22527943551540375, "learning_rate": 0.001, "loss": 1.8807, "step": 490200 }, { "epoch": 28.67754576826344, "grad_norm": 0.19676972925662994, "learning_rate": 0.001, "loss": 1.8768, "step": 490300 }, { "epoch": 28.68339474761654, "grad_norm": 0.20620854198932648, "learning_rate": 0.001, "loss": 1.8801, "step": 490400 }, { "epoch": 28.689243726969643, "grad_norm": 0.2045167237520218, "learning_rate": 0.001, "loss": 1.8774, "step": 490500 }, { "epoch": 28.695092706322747, "grad_norm": 0.19844463467597961, "learning_rate": 0.001, "loss": 1.8803, "step": 490600 }, { "epoch": 28.70094168567585, "grad_norm": 0.18502116203308105, "learning_rate": 0.001, "loss": 1.8778, "step": 490700 }, { "epoch": 28.70679066502895, "grad_norm": 0.20743052661418915, "learning_rate": 0.001, "loss": 1.8766, "step": 490800 }, { "epoch": 28.712639644382055, "grad_norm": 0.18659907579421997, "learning_rate": 0.001, "loss": 1.8792, "step": 490900 }, { "epoch": 28.71848862373516, "grad_norm": 0.20364068448543549, "learning_rate": 0.001, "loss": 1.8772, "step": 491000 }, { "epoch": 28.724337603088262, "grad_norm": 0.1746264398097992, "learning_rate": 0.001, "loss": 1.8784, "step": 491100 }, { "epoch": 28.730186582441362, "grad_norm": 0.17244058847427368, "learning_rate": 0.001, "loss": 1.8771, "step": 491200 }, { "epoch": 28.736035561794466, "grad_norm": 0.2627560794353485, "learning_rate": 0.001, "loss": 1.879, "step": 491300 }, { "epoch": 28.74188454114757, "grad_norm": 0.200315460562706, "learning_rate": 0.001, "loss": 1.8798, "step": 491400 }, { "epoch": 28.747733520500674, "grad_norm": 0.1809065192937851, "learning_rate": 0.001, "loss": 1.8777, "step": 491500 }, { "epoch": 28.753582499853774, "grad_norm": 0.24490797519683838, "learning_rate": 0.001, "loss": 1.8826, "step": 491600 }, { "epoch": 28.759431479206878, "grad_norm": 0.21077874302864075, "learning_rate": 0.001, "loss": 1.8792, "step": 491700 }, { "epoch": 28.76528045855998, "grad_norm": 0.226949080824852, "learning_rate": 0.001, "loss": 1.877, "step": 491800 }, { "epoch": 28.771129437913086, "grad_norm": 0.20004650950431824, "learning_rate": 0.001, "loss": 1.875, "step": 491900 }, { "epoch": 28.776978417266186, "grad_norm": 0.24200063943862915, "learning_rate": 0.001, "loss": 1.8787, "step": 492000 }, { "epoch": 28.78282739661929, "grad_norm": 0.20203760266304016, "learning_rate": 0.001, "loss": 1.8804, "step": 492100 }, { "epoch": 28.788676375972393, "grad_norm": 0.23660892248153687, "learning_rate": 0.001, "loss": 1.8824, "step": 492200 }, { "epoch": 28.794525355325497, "grad_norm": 0.23294329643249512, "learning_rate": 0.001, "loss": 1.8757, "step": 492300 }, { "epoch": 28.800374334678597, "grad_norm": 0.2330494374036789, "learning_rate": 0.001, "loss": 1.8776, "step": 492400 }, { "epoch": 28.8062233140317, "grad_norm": 0.22831761837005615, "learning_rate": 0.001, "loss": 1.8783, "step": 492500 }, { "epoch": 28.812072293384805, "grad_norm": 0.22069697082042694, "learning_rate": 0.001, "loss": 1.8831, "step": 492600 }, { "epoch": 28.81792127273791, "grad_norm": 0.25923073291778564, "learning_rate": 0.001, "loss": 1.8832, "step": 492700 }, { "epoch": 28.82377025209101, "grad_norm": 0.18190747499465942, "learning_rate": 0.001, "loss": 1.8733, "step": 492800 }, { "epoch": 28.829619231444113, "grad_norm": 0.21025951206684113, "learning_rate": 0.001, "loss": 1.8811, "step": 492900 }, { "epoch": 28.835468210797217, "grad_norm": 0.20760640501976013, "learning_rate": 0.001, "loss": 1.8773, "step": 493000 }, { "epoch": 28.84131719015032, "grad_norm": 0.20154963433742523, "learning_rate": 0.001, "loss": 1.879, "step": 493100 }, { "epoch": 28.84716616950342, "grad_norm": 0.25512248277664185, "learning_rate": 0.001, "loss": 1.8793, "step": 493200 }, { "epoch": 28.853015148856525, "grad_norm": 0.1869526356458664, "learning_rate": 0.001, "loss": 1.8756, "step": 493300 }, { "epoch": 28.85886412820963, "grad_norm": 0.18224957585334778, "learning_rate": 0.001, "loss": 1.8767, "step": 493400 }, { "epoch": 28.86471310756273, "grad_norm": 0.20648866891860962, "learning_rate": 0.001, "loss": 1.8798, "step": 493500 }, { "epoch": 28.870562086915832, "grad_norm": 0.2264067381620407, "learning_rate": 0.001, "loss": 1.8834, "step": 493600 }, { "epoch": 28.876411066268936, "grad_norm": 0.2439790964126587, "learning_rate": 0.001, "loss": 1.8811, "step": 493700 }, { "epoch": 28.88226004562204, "grad_norm": 0.2053464651107788, "learning_rate": 0.001, "loss": 1.8821, "step": 493800 }, { "epoch": 28.88810902497514, "grad_norm": 0.21288953721523285, "learning_rate": 0.001, "loss": 1.882, "step": 493900 }, { "epoch": 28.893958004328244, "grad_norm": 0.2635190784931183, "learning_rate": 0.001, "loss": 1.878, "step": 494000 }, { "epoch": 28.899806983681348, "grad_norm": 0.2136104851961136, "learning_rate": 0.001, "loss": 1.8756, "step": 494100 }, { "epoch": 28.90565596303445, "grad_norm": 0.20899264514446259, "learning_rate": 0.001, "loss": 1.8815, "step": 494200 }, { "epoch": 28.911504942387552, "grad_norm": 0.2199202924966812, "learning_rate": 0.001, "loss": 1.8786, "step": 494300 }, { "epoch": 28.917353921740656, "grad_norm": 0.18952558934688568, "learning_rate": 0.001, "loss": 1.8857, "step": 494400 }, { "epoch": 28.92320290109376, "grad_norm": 0.253278523683548, "learning_rate": 0.001, "loss": 1.8775, "step": 494500 }, { "epoch": 28.929051880446863, "grad_norm": 0.20746669173240662, "learning_rate": 0.001, "loss": 1.88, "step": 494600 }, { "epoch": 28.934900859799964, "grad_norm": 0.2037365436553955, "learning_rate": 0.001, "loss": 1.8818, "step": 494700 }, { "epoch": 28.940749839153067, "grad_norm": 0.23057492077350616, "learning_rate": 0.001, "loss": 1.8808, "step": 494800 }, { "epoch": 28.94659881850617, "grad_norm": 0.1882539689540863, "learning_rate": 0.001, "loss": 1.8791, "step": 494900 }, { "epoch": 28.952447797859275, "grad_norm": 0.21075429022312164, "learning_rate": 0.001, "loss": 1.8832, "step": 495000 }, { "epoch": 28.958296777212375, "grad_norm": 0.2199152559041977, "learning_rate": 0.001, "loss": 1.8824, "step": 495100 }, { "epoch": 28.96414575656548, "grad_norm": 0.20969773828983307, "learning_rate": 0.001, "loss": 1.8771, "step": 495200 }, { "epoch": 28.969994735918583, "grad_norm": 0.22690917551517487, "learning_rate": 0.001, "loss": 1.8758, "step": 495300 }, { "epoch": 28.975843715271687, "grad_norm": 0.20665526390075684, "learning_rate": 0.001, "loss": 1.8814, "step": 495400 }, { "epoch": 28.981692694624787, "grad_norm": 0.2508780062198639, "learning_rate": 0.001, "loss": 1.8797, "step": 495500 }, { "epoch": 28.98754167397789, "grad_norm": 0.1873021423816681, "learning_rate": 0.001, "loss": 1.8795, "step": 495600 }, { "epoch": 28.993390653330994, "grad_norm": 0.21614447236061096, "learning_rate": 0.001, "loss": 1.8789, "step": 495700 }, { "epoch": 28.999239632684098, "grad_norm": 0.2170064002275467, "learning_rate": 0.001, "loss": 1.8821, "step": 495800 }, { "epoch": 29.0050886120372, "grad_norm": 0.24333111941814423, "learning_rate": 0.001, "loss": 1.862, "step": 495900 }, { "epoch": 29.010937591390302, "grad_norm": 0.2503686249256134, "learning_rate": 0.001, "loss": 1.8624, "step": 496000 }, { "epoch": 29.016786570743406, "grad_norm": 0.25092029571533203, "learning_rate": 0.001, "loss": 1.865, "step": 496100 }, { "epoch": 29.02263555009651, "grad_norm": 0.22663013637065887, "learning_rate": 0.001, "loss": 1.8674, "step": 496200 }, { "epoch": 29.02848452944961, "grad_norm": 0.2995929419994354, "learning_rate": 0.001, "loss": 1.8619, "step": 496300 }, { "epoch": 29.034333508802714, "grad_norm": 0.2715704143047333, "learning_rate": 0.001, "loss": 1.8643, "step": 496400 }, { "epoch": 29.040182488155818, "grad_norm": 0.22250114381313324, "learning_rate": 0.001, "loss": 1.863, "step": 496500 }, { "epoch": 29.046031467508918, "grad_norm": 0.22177545726299286, "learning_rate": 0.001, "loss": 1.8659, "step": 496600 }, { "epoch": 29.05188044686202, "grad_norm": 0.24951279163360596, "learning_rate": 0.001, "loss": 1.864, "step": 496700 }, { "epoch": 29.057729426215126, "grad_norm": 0.2217448502779007, "learning_rate": 0.001, "loss": 1.865, "step": 496800 }, { "epoch": 29.06357840556823, "grad_norm": 0.27842968702316284, "learning_rate": 0.001, "loss": 1.8653, "step": 496900 }, { "epoch": 29.06942738492133, "grad_norm": 0.1850174218416214, "learning_rate": 0.001, "loss": 1.8653, "step": 497000 }, { "epoch": 29.075276364274433, "grad_norm": 0.20843642950057983, "learning_rate": 0.001, "loss": 1.8644, "step": 497100 }, { "epoch": 29.081125343627537, "grad_norm": 0.21980629861354828, "learning_rate": 0.001, "loss": 1.8631, "step": 497200 }, { "epoch": 29.08697432298064, "grad_norm": 0.22724631428718567, "learning_rate": 0.001, "loss": 1.8674, "step": 497300 }, { "epoch": 29.09282330233374, "grad_norm": 0.2570313513278961, "learning_rate": 0.001, "loss": 1.8741, "step": 497400 }, { "epoch": 29.098672281686845, "grad_norm": 0.22831377387046814, "learning_rate": 0.001, "loss": 1.8684, "step": 497500 }, { "epoch": 29.10452126103995, "grad_norm": 0.2638630270957947, "learning_rate": 0.001, "loss": 1.872, "step": 497600 }, { "epoch": 29.110370240393053, "grad_norm": 0.24325834214687347, "learning_rate": 0.001, "loss": 1.8666, "step": 497700 }, { "epoch": 29.116219219746153, "grad_norm": 0.25414979457855225, "learning_rate": 0.001, "loss": 1.8709, "step": 497800 }, { "epoch": 29.122068199099257, "grad_norm": 0.26389291882514954, "learning_rate": 0.001, "loss": 1.8686, "step": 497900 }, { "epoch": 29.12791717845236, "grad_norm": 0.2324562668800354, "learning_rate": 0.001, "loss": 1.8648, "step": 498000 }, { "epoch": 29.133766157805464, "grad_norm": 0.22303645312786102, "learning_rate": 0.001, "loss": 1.873, "step": 498100 }, { "epoch": 29.139615137158565, "grad_norm": 0.21762141585350037, "learning_rate": 0.001, "loss": 1.8726, "step": 498200 }, { "epoch": 29.14546411651167, "grad_norm": 0.23987418413162231, "learning_rate": 0.001, "loss": 1.8668, "step": 498300 }, { "epoch": 29.151313095864772, "grad_norm": 0.21518918871879578, "learning_rate": 0.001, "loss": 1.8684, "step": 498400 }, { "epoch": 29.157162075217876, "grad_norm": 0.22903303802013397, "learning_rate": 0.001, "loss": 1.8674, "step": 498500 }, { "epoch": 29.163011054570976, "grad_norm": 0.26850777864456177, "learning_rate": 0.001, "loss": 1.8629, "step": 498600 }, { "epoch": 29.16886003392408, "grad_norm": 0.2501225471496582, "learning_rate": 0.001, "loss": 1.8706, "step": 498700 }, { "epoch": 29.174709013277184, "grad_norm": 0.253105103969574, "learning_rate": 0.001, "loss": 1.871, "step": 498800 }, { "epoch": 29.180557992630288, "grad_norm": 0.20729489624500275, "learning_rate": 0.001, "loss": 1.8723, "step": 498900 }, { "epoch": 29.186406971983388, "grad_norm": 0.25313377380371094, "learning_rate": 0.001, "loss": 1.8675, "step": 499000 }, { "epoch": 29.19225595133649, "grad_norm": 0.25151947140693665, "learning_rate": 0.001, "loss": 1.8624, "step": 499100 }, { "epoch": 29.198104930689595, "grad_norm": 0.19515396654605865, "learning_rate": 0.001, "loss": 1.8671, "step": 499200 }, { "epoch": 29.2039539100427, "grad_norm": 0.2764059603214264, "learning_rate": 0.001, "loss": 1.8705, "step": 499300 }, { "epoch": 29.2098028893958, "grad_norm": 0.22443656623363495, "learning_rate": 0.001, "loss": 1.87, "step": 499400 }, { "epoch": 29.215651868748903, "grad_norm": 0.2510312795639038, "learning_rate": 0.001, "loss": 1.8653, "step": 499500 }, { "epoch": 29.221500848102007, "grad_norm": 0.21136803925037384, "learning_rate": 0.001, "loss": 1.8697, "step": 499600 }, { "epoch": 29.227349827455107, "grad_norm": 0.2559882402420044, "learning_rate": 0.001, "loss": 1.8688, "step": 499700 }, { "epoch": 29.23319880680821, "grad_norm": 0.2707280218601227, "learning_rate": 0.001, "loss": 1.8702, "step": 499800 }, { "epoch": 29.239047786161315, "grad_norm": 0.2747200131416321, "learning_rate": 0.001, "loss": 1.8678, "step": 499900 }, { "epoch": 29.24489676551442, "grad_norm": 0.24215379357337952, "learning_rate": 0.001, "loss": 1.8665, "step": 500000 }, { "epoch": 29.24489676551442, "eval_ag_news_accuracy": 0.240234375, "eval_ag_news_bleu_score": 7.252020925607862, "eval_ag_news_bleu_score_sem": 0.4104202816798245, "eval_ag_news_emb_cos_sim": 0.726897656917572, "eval_ag_news_emb_cos_sim_sem": 0.014064237475395203, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.699155330657959, "eval_ag_news_n_ngrams_match_1": 13.78125, "eval_ag_news_n_ngrams_match_2": 4.09375, "eval_ag_news_n_ngrams_match_3": 1.671875, "eval_ag_news_num_pred_words": 45.203125, "eval_ag_news_num_true_words": 43.8203125, "eval_ag_news_perplexity": 14.867168578266376, "eval_ag_news_pred_num_tokens": 67.296875, "eval_ag_news_rouge_score": 0.2915487247556534, "eval_ag_news_runtime": 39.0851, "eval_ag_news_samples_per_second": 12.793, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.33188632763258163, "eval_ag_news_token_set_f1_sem": 0.009250887107549477, "eval_ag_news_token_set_precision": 0.3156037812115838, "eval_ag_news_token_set_recall": 0.35989020712890957, "eval_ag_news_true_num_tokens": 61.15625, "step": 500000 }, { "epoch": 29.24489676551442, "eval_anthropic_toxic_prompts_accuracy": 0.103796875, "eval_anthropic_toxic_prompts_bleu_score": 43.71443909161776, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.69899126716645, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8946130275726318, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008428731933236122, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1328125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030114393430435732, "eval_anthropic_toxic_prompts_loss": 1.281551718711853, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.2578125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.765625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.875, "eval_anthropic_toxic_prompts_num_pred_words": 14.6171875, "eval_anthropic_toxic_prompts_num_true_words": 14.6328125, "eval_anthropic_toxic_prompts_perplexity": 3.6022250310220265, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.046875, "eval_anthropic_toxic_prompts_rouge_score": 0.6831356965907327, "eval_anthropic_toxic_prompts_runtime": 30.9314, "eval_anthropic_toxic_prompts_samples_per_second": 16.165, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.7015445265504785, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01916702228335252, "eval_anthropic_toxic_prompts_token_set_precision": 0.7066996618643253, "eval_anthropic_toxic_prompts_token_set_recall": 0.7056461057627359, "eval_anthropic_toxic_prompts_true_num_tokens": 17.9375, "step": 500000 }, { "epoch": 29.24489676551442, "eval_arxiv_accuracy": 0.369796875, "eval_arxiv_bleu_score": 1.7073166994176512, "eval_arxiv_bleu_score_sem": 0.16697042524325267, "eval_arxiv_emb_cos_sim": 0.4700468182563782, "eval_arxiv_emb_cos_sim_sem": 0.020406296476721764, "eval_arxiv_emb_top1_equal": 0.8671875, "eval_arxiv_emb_top1_equal_sem": 0.030114395543932915, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.459486246109009, "eval_arxiv_n_ngrams_match_1": 13.828125, "eval_arxiv_n_ngrams_match_2": 2.5390625, "eval_arxiv_n_ngrams_match_3": 0.546875, "eval_arxiv_num_pred_words": 53.390625, "eval_arxiv_num_true_words": 86.90625, "eval_arxiv_perplexity": 31.8006346173998, "eval_arxiv_pred_num_tokens": 125.46875, "eval_arxiv_rouge_score": 0.18697937507579143, "eval_arxiv_runtime": 31.1845, "eval_arxiv_samples_per_second": 16.034, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.1786249667492865, "eval_arxiv_token_set_f1_sem": 0.00819808372597023, "eval_arxiv_token_set_precision": 0.12202229422574658, "eval_arxiv_token_set_recall": 0.41082039675546983, "eval_arxiv_true_num_tokens": 124.203125, "step": 500000 }, { "epoch": 29.24489676551442, "eval_python_code_alpaca_accuracy": 0.128515625, "eval_python_code_alpaca_bleu_score": 29.12013592044859, "eval_python_code_alpaca_bleu_score_sem": 1.686484946986365, "eval_python_code_alpaca_emb_cos_sim": 0.8640813827514648, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009956685826182365, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.5054986476898193, "eval_python_code_alpaca_n_ngrams_match_1": 10.328125, "eval_python_code_alpaca_n_ngrams_match_2": 5.828125, "eval_python_code_alpaca_n_ngrams_match_3": 3.4296875, "eval_python_code_alpaca_num_pred_words": 17.078125, "eval_python_code_alpaca_num_true_words": 18.0625, "eval_python_code_alpaca_perplexity": 4.506400176161724, "eval_python_code_alpaca_pred_num_tokens": 23.3125, "eval_python_code_alpaca_rouge_score": 0.6080302066164143, "eval_python_code_alpaca_runtime": 29.8195, "eval_python_code_alpaca_samples_per_second": 16.768, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6283927594972486, "eval_python_code_alpaca_token_set_f1_sem": 0.013760812275148318, "eval_python_code_alpaca_token_set_precision": 0.61420486031574, "eval_python_code_alpaca_token_set_recall": 0.6516619389372625, "eval_python_code_alpaca_true_num_tokens": 23.859375, "step": 500000 }, { "epoch": 29.24489676551442, "eval_wikibio_accuracy": 0.3651875, "eval_wikibio_bleu_score": 8.29224873448715, "eval_wikibio_bleu_score_sem": 0.7919121687492006, "eval_wikibio_emb_cos_sim": 0.596114993095398, "eval_wikibio_emb_cos_sim_sem": 0.023760385811328888, "eval_wikibio_emb_top1_equal": 0.921875, "eval_wikibio_emb_top1_equal_sem": 0.023813825100660324, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.754192352294922, "eval_wikibio_n_ngrams_match_1": 15.796875, "eval_wikibio_n_ngrams_match_2": 5.5, "eval_wikibio_n_ngrams_match_3": 2.3203125, "eval_wikibio_num_pred_words": 53.484375, "eval_wikibio_num_true_words": 51.625, "eval_wikibio_perplexity": 15.708348966193654, "eval_wikibio_pred_num_tokens": 103.4609375, "eval_wikibio_rouge_score": 0.3119954834873896, "eval_wikibio_runtime": 30.724, "eval_wikibio_samples_per_second": 16.274, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.3392416296632608, "eval_wikibio_token_set_f1_sem": 0.01305898581766949, "eval_wikibio_token_set_precision": 0.30394385203807134, "eval_wikibio_token_set_recall": 0.4098563553005365, "eval_wikibio_true_num_tokens": 97.6328125, "step": 500000 }, { "epoch": 29.24489676551442, "eval_msmarco_accuracy": 0.391625, "eval_msmarco_bleu_score": 17.934038745636357, "eval_msmarco_bleu_score_sem": 1.6330666887405356, "eval_msmarco_emb_cos_sim": 0.7794125080108643, "eval_msmarco_emb_cos_sim_sem": 0.018111633136868477, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7750608921051025, "eval_msmarco_n_ngrams_match_1": 28.765625, "eval_msmarco_n_ngrams_match_2": 13.328125, "eval_msmarco_n_ngrams_match_3": 8.09375, "eval_msmarco_num_pred_words": 62.28125, "eval_msmarco_num_true_words": 63.1171875, "eval_msmarco_perplexity": 5.900640427797, "eval_msmarco_pred_num_tokens": 83.5390625, "eval_msmarco_rouge_score": 0.44669274203263826, "eval_msmarco_runtime": 26.111, "eval_msmarco_samples_per_second": 19.149, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.4679072318894953, "eval_msmarco_token_set_f1_sem": 0.014753859393474386, "eval_msmarco_token_set_precision": 0.4303415402472298, "eval_msmarco_token_set_recall": 0.5309560366167763, "eval_msmarco_true_num_tokens": 80.2890625, "step": 500000 }, { "epoch": 29.25074574486752, "grad_norm": 0.25659823417663574, "learning_rate": 0.001, "loss": 1.8678, "step": 500100 }, { "epoch": 29.256594724220623, "grad_norm": 0.2622469663619995, "learning_rate": 0.001, "loss": 1.87, "step": 500200 }, { "epoch": 29.262443703573727, "grad_norm": 0.24334579706192017, "learning_rate": 0.001, "loss": 1.8675, "step": 500300 }, { "epoch": 29.26829268292683, "grad_norm": 0.20271466672420502, "learning_rate": 0.001, "loss": 1.8719, "step": 500400 }, { "epoch": 29.27414166227993, "grad_norm": 0.2598907947540283, "learning_rate": 0.001, "loss": 1.8734, "step": 500500 }, { "epoch": 29.279990641633034, "grad_norm": 0.2936217486858368, "learning_rate": 0.001, "loss": 1.8707, "step": 500600 }, { "epoch": 29.28583962098614, "grad_norm": 0.2501504421234131, "learning_rate": 0.001, "loss": 1.8736, "step": 500700 }, { "epoch": 29.291688600339242, "grad_norm": 0.20376992225646973, "learning_rate": 0.001, "loss": 1.8766, "step": 500800 }, { "epoch": 29.297537579692342, "grad_norm": 0.27114173769950867, "learning_rate": 0.001, "loss": 1.8672, "step": 500900 }, { "epoch": 29.303386559045446, "grad_norm": 0.2047877311706543, "learning_rate": 0.001, "loss": 1.8693, "step": 501000 }, { "epoch": 29.30923553839855, "grad_norm": 0.24847443401813507, "learning_rate": 0.001, "loss": 1.8719, "step": 501100 }, { "epoch": 29.315084517751654, "grad_norm": 0.2612980604171753, "learning_rate": 0.001, "loss": 1.8687, "step": 501200 }, { "epoch": 29.320933497104754, "grad_norm": 0.22723615169525146, "learning_rate": 0.001, "loss": 1.8641, "step": 501300 }, { "epoch": 29.326782476457858, "grad_norm": 0.22445954382419586, "learning_rate": 0.001, "loss": 1.8741, "step": 501400 }, { "epoch": 29.33263145581096, "grad_norm": 0.20436732470989227, "learning_rate": 0.001, "loss": 1.8708, "step": 501500 }, { "epoch": 29.338480435164065, "grad_norm": 0.21741943061351776, "learning_rate": 0.001, "loss": 1.8745, "step": 501600 }, { "epoch": 29.344329414517166, "grad_norm": 0.26316335797309875, "learning_rate": 0.001, "loss": 1.8681, "step": 501700 }, { "epoch": 29.35017839387027, "grad_norm": 0.2629720866680145, "learning_rate": 0.001, "loss": 1.8688, "step": 501800 }, { "epoch": 29.356027373223373, "grad_norm": 0.2448149025440216, "learning_rate": 0.001, "loss": 1.8619, "step": 501900 }, { "epoch": 29.361876352576477, "grad_norm": 0.23319515585899353, "learning_rate": 0.001, "loss": 1.8674, "step": 502000 }, { "epoch": 29.367725331929577, "grad_norm": 0.27076807618141174, "learning_rate": 0.001, "loss": 1.8708, "step": 502100 }, { "epoch": 29.37357431128268, "grad_norm": 0.21658629179000854, "learning_rate": 0.001, "loss": 1.8643, "step": 502200 }, { "epoch": 29.379423290635785, "grad_norm": 0.2682393789291382, "learning_rate": 0.001, "loss": 1.8688, "step": 502300 }, { "epoch": 29.38527226998889, "grad_norm": 0.2393156886100769, "learning_rate": 0.001, "loss": 1.8779, "step": 502400 }, { "epoch": 29.39112124934199, "grad_norm": 0.27264997363090515, "learning_rate": 0.001, "loss": 1.8713, "step": 502500 }, { "epoch": 29.396970228695093, "grad_norm": 0.21837611496448517, "learning_rate": 0.001, "loss": 1.8674, "step": 502600 }, { "epoch": 29.402819208048196, "grad_norm": 0.26588174700737, "learning_rate": 0.001, "loss": 1.8668, "step": 502700 }, { "epoch": 29.408668187401297, "grad_norm": 0.19721713662147522, "learning_rate": 0.001, "loss": 1.8716, "step": 502800 }, { "epoch": 29.4145171667544, "grad_norm": 0.23809264600276947, "learning_rate": 0.001, "loss": 1.8776, "step": 502900 }, { "epoch": 29.420366146107504, "grad_norm": 0.21647077798843384, "learning_rate": 0.001, "loss": 1.8701, "step": 503000 }, { "epoch": 29.426215125460608, "grad_norm": 0.234223872423172, "learning_rate": 0.001, "loss": 1.8747, "step": 503100 }, { "epoch": 29.43206410481371, "grad_norm": 0.19942119717597961, "learning_rate": 0.001, "loss": 1.8726, "step": 503200 }, { "epoch": 29.437913084166812, "grad_norm": 0.27736738324165344, "learning_rate": 0.001, "loss": 1.8758, "step": 503300 }, { "epoch": 29.443762063519916, "grad_norm": 0.23831529915332794, "learning_rate": 0.001, "loss": 1.8788, "step": 503400 }, { "epoch": 29.44961104287302, "grad_norm": 0.2526850998401642, "learning_rate": 0.001, "loss": 1.8685, "step": 503500 }, { "epoch": 29.45546002222612, "grad_norm": 0.23836596310138702, "learning_rate": 0.001, "loss": 1.8752, "step": 503600 }, { "epoch": 29.461309001579224, "grad_norm": 0.22553962469100952, "learning_rate": 0.001, "loss": 1.8705, "step": 503700 }, { "epoch": 29.467157980932328, "grad_norm": 0.2371017187833786, "learning_rate": 0.001, "loss": 1.8792, "step": 503800 }, { "epoch": 29.47300696028543, "grad_norm": 0.21364162862300873, "learning_rate": 0.001, "loss": 1.8685, "step": 503900 }, { "epoch": 29.47885593963853, "grad_norm": 0.2171664535999298, "learning_rate": 0.001, "loss": 1.8692, "step": 504000 }, { "epoch": 29.484704918991635, "grad_norm": 0.23483923077583313, "learning_rate": 0.001, "loss": 1.8732, "step": 504100 }, { "epoch": 29.49055389834474, "grad_norm": 0.3319283425807953, "learning_rate": 0.001, "loss": 1.8698, "step": 504200 }, { "epoch": 29.496402877697843, "grad_norm": 0.31401288509368896, "learning_rate": 0.001, "loss": 1.8713, "step": 504300 }, { "epoch": 29.502251857050943, "grad_norm": 0.2400626838207245, "learning_rate": 0.001, "loss": 1.8712, "step": 504400 }, { "epoch": 29.508100836404047, "grad_norm": 0.2014325112104416, "learning_rate": 0.001, "loss": 1.869, "step": 504500 }, { "epoch": 29.51394981575715, "grad_norm": 0.2737313210964203, "learning_rate": 0.001, "loss": 1.8729, "step": 504600 }, { "epoch": 29.519798795110255, "grad_norm": 0.2906436324119568, "learning_rate": 0.001, "loss": 1.8729, "step": 504700 }, { "epoch": 29.525647774463355, "grad_norm": 0.1883103847503662, "learning_rate": 0.001, "loss": 1.877, "step": 504800 }, { "epoch": 29.53149675381646, "grad_norm": 0.2905609607696533, "learning_rate": 0.001, "loss": 1.8691, "step": 504900 }, { "epoch": 29.537345733169563, "grad_norm": 0.21960799396038055, "learning_rate": 0.001, "loss": 1.8712, "step": 505000 }, { "epoch": 29.543194712522666, "grad_norm": 0.24592922627925873, "learning_rate": 0.001, "loss": 1.8748, "step": 505100 }, { "epoch": 29.549043691875767, "grad_norm": 0.2632077634334564, "learning_rate": 0.001, "loss": 1.8707, "step": 505200 }, { "epoch": 29.55489267122887, "grad_norm": 0.2804294526576996, "learning_rate": 0.001, "loss": 1.8724, "step": 505300 }, { "epoch": 29.560741650581974, "grad_norm": 0.2693489193916321, "learning_rate": 0.001, "loss": 1.8739, "step": 505400 }, { "epoch": 29.566590629935078, "grad_norm": 0.26907309889793396, "learning_rate": 0.001, "loss": 1.871, "step": 505500 }, { "epoch": 29.57243960928818, "grad_norm": 0.25074949860572815, "learning_rate": 0.001, "loss": 1.8705, "step": 505600 }, { "epoch": 29.578288588641282, "grad_norm": 0.2509981691837311, "learning_rate": 0.001, "loss": 1.8741, "step": 505700 }, { "epoch": 29.584137567994386, "grad_norm": 0.23839950561523438, "learning_rate": 0.001, "loss": 1.8736, "step": 505800 }, { "epoch": 29.589986547347486, "grad_norm": 0.2609831392765045, "learning_rate": 0.001, "loss": 1.8774, "step": 505900 }, { "epoch": 29.59583552670059, "grad_norm": 0.2500745356082916, "learning_rate": 0.001, "loss": 1.8667, "step": 506000 }, { "epoch": 29.601684506053694, "grad_norm": 0.314613938331604, "learning_rate": 0.001, "loss": 1.8743, "step": 506100 }, { "epoch": 29.607533485406798, "grad_norm": 0.3023127317428589, "learning_rate": 0.001, "loss": 1.8734, "step": 506200 }, { "epoch": 29.613382464759898, "grad_norm": 0.20769451558589935, "learning_rate": 0.001, "loss": 1.8697, "step": 506300 }, { "epoch": 29.619231444113, "grad_norm": 0.2876565158367157, "learning_rate": 0.001, "loss": 1.8729, "step": 506400 }, { "epoch": 29.625080423466105, "grad_norm": 0.20776091516017914, "learning_rate": 0.001, "loss": 1.8736, "step": 506500 }, { "epoch": 29.63092940281921, "grad_norm": 0.20880955457687378, "learning_rate": 0.001, "loss": 1.8741, "step": 506600 }, { "epoch": 29.63677838217231, "grad_norm": 0.2427014261484146, "learning_rate": 0.001, "loss": 1.8711, "step": 506700 }, { "epoch": 29.642627361525413, "grad_norm": 0.28721803426742554, "learning_rate": 0.001, "loss": 1.8731, "step": 506800 }, { "epoch": 29.648476340878517, "grad_norm": 0.20497581362724304, "learning_rate": 0.001, "loss": 1.8742, "step": 506900 }, { "epoch": 29.65432532023162, "grad_norm": 0.2695253789424896, "learning_rate": 0.001, "loss": 1.8742, "step": 507000 }, { "epoch": 29.66017429958472, "grad_norm": 0.25970467925071716, "learning_rate": 0.001, "loss": 1.8741, "step": 507100 }, { "epoch": 29.666023278937825, "grad_norm": 0.21350403130054474, "learning_rate": 0.001, "loss": 1.8731, "step": 507200 }, { "epoch": 29.67187225829093, "grad_norm": 0.2525138258934021, "learning_rate": 0.001, "loss": 1.8688, "step": 507300 }, { "epoch": 29.677721237644032, "grad_norm": 0.31685593724250793, "learning_rate": 0.001, "loss": 1.8757, "step": 507400 }, { "epoch": 29.683570216997133, "grad_norm": 0.20506474375724792, "learning_rate": 0.001, "loss": 1.8758, "step": 507500 }, { "epoch": 29.689419196350237, "grad_norm": 0.22578847408294678, "learning_rate": 0.001, "loss": 1.8741, "step": 507600 }, { "epoch": 29.69526817570334, "grad_norm": 0.21415653824806213, "learning_rate": 0.001, "loss": 1.8748, "step": 507700 }, { "epoch": 29.701117155056444, "grad_norm": 0.21987903118133545, "learning_rate": 0.001, "loss": 1.8738, "step": 507800 }, { "epoch": 29.706966134409544, "grad_norm": 0.2362530678510666, "learning_rate": 0.001, "loss": 1.8754, "step": 507900 }, { "epoch": 29.712815113762648, "grad_norm": 0.23734824359416962, "learning_rate": 0.001, "loss": 1.8739, "step": 508000 }, { "epoch": 29.718664093115752, "grad_norm": 0.21447546780109406, "learning_rate": 0.001, "loss": 1.8715, "step": 508100 }, { "epoch": 29.724513072468856, "grad_norm": 0.23532237112522125, "learning_rate": 0.001, "loss": 1.8737, "step": 508200 }, { "epoch": 29.730362051821956, "grad_norm": 0.3121623694896698, "learning_rate": 0.001, "loss": 1.8735, "step": 508300 }, { "epoch": 29.73621103117506, "grad_norm": 0.27387183904647827, "learning_rate": 0.001, "loss": 1.8737, "step": 508400 }, { "epoch": 29.742060010528164, "grad_norm": 0.25221794843673706, "learning_rate": 0.001, "loss": 1.8726, "step": 508500 }, { "epoch": 29.747908989881267, "grad_norm": 0.22159993648529053, "learning_rate": 0.001, "loss": 1.8729, "step": 508600 }, { "epoch": 29.753757969234368, "grad_norm": 0.27525001764297485, "learning_rate": 0.001, "loss": 1.8754, "step": 508700 }, { "epoch": 29.75960694858747, "grad_norm": 0.27680256962776184, "learning_rate": 0.001, "loss": 1.872, "step": 508800 }, { "epoch": 29.765455927940575, "grad_norm": 0.2854146659374237, "learning_rate": 0.001, "loss": 1.8738, "step": 508900 }, { "epoch": 29.771304907293676, "grad_norm": 0.25242140889167786, "learning_rate": 0.001, "loss": 1.8754, "step": 509000 }, { "epoch": 29.77715388664678, "grad_norm": 0.23549410700798035, "learning_rate": 0.001, "loss": 1.8722, "step": 509100 }, { "epoch": 29.783002865999883, "grad_norm": 0.24625897407531738, "learning_rate": 0.001, "loss": 1.8692, "step": 509200 }, { "epoch": 29.788851845352987, "grad_norm": 0.26700007915496826, "learning_rate": 0.001, "loss": 1.8709, "step": 509300 }, { "epoch": 29.794700824706087, "grad_norm": 0.21684113144874573, "learning_rate": 0.001, "loss": 1.8707, "step": 509400 }, { "epoch": 29.80054980405919, "grad_norm": 0.21042872965335846, "learning_rate": 0.001, "loss": 1.8783, "step": 509500 }, { "epoch": 29.806398783412295, "grad_norm": 0.2170437127351761, "learning_rate": 0.001, "loss": 1.873, "step": 509600 }, { "epoch": 29.8122477627654, "grad_norm": 0.18652014434337616, "learning_rate": 0.001, "loss": 1.8688, "step": 509700 }, { "epoch": 29.8180967421185, "grad_norm": 0.24533618986606598, "learning_rate": 0.001, "loss": 1.8797, "step": 509800 }, { "epoch": 29.823945721471603, "grad_norm": 0.2935948967933655, "learning_rate": 0.001, "loss": 1.8713, "step": 509900 }, { "epoch": 29.829794700824706, "grad_norm": 0.3016955852508545, "learning_rate": 0.001, "loss": 1.8716, "step": 510000 }, { "epoch": 29.83564368017781, "grad_norm": 0.299270898103714, "learning_rate": 0.001, "loss": 1.8727, "step": 510100 }, { "epoch": 29.84149265953091, "grad_norm": 0.2969864010810852, "learning_rate": 0.001, "loss": 1.8759, "step": 510200 }, { "epoch": 29.847341638884014, "grad_norm": 0.2049679458141327, "learning_rate": 0.001, "loss": 1.8709, "step": 510300 }, { "epoch": 29.853190618237118, "grad_norm": 0.2361070215702057, "learning_rate": 0.001, "loss": 1.8717, "step": 510400 }, { "epoch": 29.859039597590222, "grad_norm": 0.24300633370876312, "learning_rate": 0.001, "loss": 1.8744, "step": 510500 }, { "epoch": 29.864888576943322, "grad_norm": 0.31077611446380615, "learning_rate": 0.001, "loss": 1.8709, "step": 510600 }, { "epoch": 29.870737556296426, "grad_norm": 0.17479868233203888, "learning_rate": 0.001, "loss": 1.8725, "step": 510700 }, { "epoch": 29.87658653564953, "grad_norm": 0.23201604187488556, "learning_rate": 0.001, "loss": 1.8768, "step": 510800 }, { "epoch": 29.882435515002634, "grad_norm": 0.33346641063690186, "learning_rate": 0.001, "loss": 1.8714, "step": 510900 }, { "epoch": 29.888284494355734, "grad_norm": 0.19988906383514404, "learning_rate": 0.001, "loss": 1.879, "step": 511000 }, { "epoch": 29.894133473708838, "grad_norm": 0.20995266735553741, "learning_rate": 0.001, "loss": 1.8718, "step": 511100 }, { "epoch": 29.89998245306194, "grad_norm": 0.21461784839630127, "learning_rate": 0.001, "loss": 1.8695, "step": 511200 }, { "epoch": 29.905831432415045, "grad_norm": 0.2502298057079315, "learning_rate": 0.001, "loss": 1.8774, "step": 511300 }, { "epoch": 29.911680411768145, "grad_norm": 0.21484696865081787, "learning_rate": 0.001, "loss": 1.8749, "step": 511400 }, { "epoch": 29.91752939112125, "grad_norm": 0.2353731095790863, "learning_rate": 0.001, "loss": 1.8744, "step": 511500 }, { "epoch": 29.923378370474353, "grad_norm": 0.20660142600536346, "learning_rate": 0.001, "loss": 1.8752, "step": 511600 }, { "epoch": 29.929227349827457, "grad_norm": 0.22610974311828613, "learning_rate": 0.001, "loss": 1.8785, "step": 511700 }, { "epoch": 29.935076329180557, "grad_norm": 0.36025288701057434, "learning_rate": 0.001, "loss": 1.8798, "step": 511800 }, { "epoch": 29.94092530853366, "grad_norm": 0.23611773550510406, "learning_rate": 0.001, "loss": 1.8719, "step": 511900 }, { "epoch": 29.946774287886765, "grad_norm": 0.2243015617132187, "learning_rate": 0.001, "loss": 1.8717, "step": 512000 }, { "epoch": 29.952623267239865, "grad_norm": 0.22381268441677094, "learning_rate": 0.001, "loss": 1.8768, "step": 512100 }, { "epoch": 29.95847224659297, "grad_norm": 0.2587682604789734, "learning_rate": 0.001, "loss": 1.8676, "step": 512200 }, { "epoch": 29.964321225946073, "grad_norm": 0.24179542064666748, "learning_rate": 0.001, "loss": 1.8692, "step": 512300 }, { "epoch": 29.970170205299176, "grad_norm": 0.27339687943458557, "learning_rate": 0.001, "loss": 1.8769, "step": 512400 }, { "epoch": 29.976019184652277, "grad_norm": 0.20669865608215332, "learning_rate": 0.001, "loss": 1.8784, "step": 512500 }, { "epoch": 29.98186816400538, "grad_norm": 0.24833990633487701, "learning_rate": 0.001, "loss": 1.8748, "step": 512600 }, { "epoch": 29.987717143358484, "grad_norm": 0.16939271986484528, "learning_rate": 0.001, "loss": 1.8749, "step": 512700 }, { "epoch": 29.993566122711588, "grad_norm": 0.19927909970283508, "learning_rate": 0.001, "loss": 1.8783, "step": 512800 }, { "epoch": 29.999415102064688, "grad_norm": 0.20089197158813477, "learning_rate": 0.001, "loss": 1.8714, "step": 512900 }, { "epoch": 30.005264081417792, "grad_norm": 0.17103751003742218, "learning_rate": 0.001, "loss": 1.8571, "step": 513000 }, { "epoch": 30.011113060770896, "grad_norm": 0.26636114716529846, "learning_rate": 0.001, "loss": 1.8638, "step": 513100 }, { "epoch": 30.016962040124, "grad_norm": 0.31119024753570557, "learning_rate": 0.001, "loss": 1.8617, "step": 513200 }, { "epoch": 30.0228110194771, "grad_norm": 0.20125284790992737, "learning_rate": 0.001, "loss": 1.8614, "step": 513300 }, { "epoch": 30.028659998830204, "grad_norm": 0.2109362930059433, "learning_rate": 0.001, "loss": 1.8618, "step": 513400 }, { "epoch": 30.034508978183307, "grad_norm": 0.2447378933429718, "learning_rate": 0.001, "loss": 1.8572, "step": 513500 }, { "epoch": 30.04035795753641, "grad_norm": 0.22311492264270782, "learning_rate": 0.001, "loss": 1.8587, "step": 513600 }, { "epoch": 30.04620693688951, "grad_norm": 0.18867577612400055, "learning_rate": 0.001, "loss": 1.8551, "step": 513700 }, { "epoch": 30.052055916242615, "grad_norm": 0.25789883732795715, "learning_rate": 0.001, "loss": 1.8612, "step": 513800 }, { "epoch": 30.05790489559572, "grad_norm": 0.24453115463256836, "learning_rate": 0.001, "loss": 1.866, "step": 513900 }, { "epoch": 30.063753874948823, "grad_norm": 0.16185249388217926, "learning_rate": 0.001, "loss": 1.859, "step": 514000 }, { "epoch": 30.069602854301923, "grad_norm": 0.19616615772247314, "learning_rate": 0.001, "loss": 1.8664, "step": 514100 }, { "epoch": 30.075451833655027, "grad_norm": 0.1627509593963623, "learning_rate": 0.001, "loss": 1.8615, "step": 514200 }, { "epoch": 30.08130081300813, "grad_norm": 0.14696773886680603, "learning_rate": 0.001, "loss": 1.8537, "step": 514300 }, { "epoch": 30.087149792361235, "grad_norm": 0.2109704613685608, "learning_rate": 0.001, "loss": 1.8602, "step": 514400 }, { "epoch": 30.092998771714335, "grad_norm": 0.24397030472755432, "learning_rate": 0.001, "loss": 1.8671, "step": 514500 }, { "epoch": 30.09884775106744, "grad_norm": 0.15610086917877197, "learning_rate": 0.001, "loss": 1.8633, "step": 514600 }, { "epoch": 30.104696730420542, "grad_norm": 0.1650897115468979, "learning_rate": 0.001, "loss": 1.8608, "step": 514700 }, { "epoch": 30.110545709773646, "grad_norm": 0.21027550101280212, "learning_rate": 0.001, "loss": 1.8636, "step": 514800 }, { "epoch": 30.116394689126746, "grad_norm": 0.15799187123775482, "learning_rate": 0.001, "loss": 1.8601, "step": 514900 }, { "epoch": 30.12224366847985, "grad_norm": 0.15747754275798798, "learning_rate": 0.001, "loss": 1.8623, "step": 515000 }, { "epoch": 30.128092647832954, "grad_norm": 0.1363467574119568, "learning_rate": 0.001, "loss": 1.8638, "step": 515100 }, { "epoch": 30.133941627186054, "grad_norm": 0.14600898325443268, "learning_rate": 0.001, "loss": 1.87, "step": 515200 }, { "epoch": 30.139790606539158, "grad_norm": 0.20752282440662384, "learning_rate": 0.001, "loss": 1.8623, "step": 515300 }, { "epoch": 30.145639585892262, "grad_norm": 0.20795321464538574, "learning_rate": 0.001, "loss": 1.8647, "step": 515400 }, { "epoch": 30.151488565245366, "grad_norm": 0.2360784411430359, "learning_rate": 0.001, "loss": 1.8568, "step": 515500 }, { "epoch": 30.157337544598466, "grad_norm": 0.18359731137752533, "learning_rate": 0.001, "loss": 1.8605, "step": 515600 }, { "epoch": 30.16318652395157, "grad_norm": 0.19560052454471588, "learning_rate": 0.001, "loss": 1.8637, "step": 515700 }, { "epoch": 30.169035503304674, "grad_norm": 0.2546687722206116, "learning_rate": 0.001, "loss": 1.865, "step": 515800 }, { "epoch": 30.174884482657777, "grad_norm": 0.29650381207466125, "learning_rate": 0.001, "loss": 1.8668, "step": 515900 }, { "epoch": 30.180733462010878, "grad_norm": 0.16150863468647003, "learning_rate": 0.001, "loss": 1.8678, "step": 516000 }, { "epoch": 30.18658244136398, "grad_norm": 0.14145521819591522, "learning_rate": 0.001, "loss": 1.8617, "step": 516100 }, { "epoch": 30.192431420717085, "grad_norm": 0.19615358114242554, "learning_rate": 0.001, "loss": 1.8612, "step": 516200 }, { "epoch": 30.19828040007019, "grad_norm": 0.19270752370357513, "learning_rate": 0.001, "loss": 1.8656, "step": 516300 }, { "epoch": 30.20412937942329, "grad_norm": 0.20524942874908447, "learning_rate": 0.001, "loss": 1.8636, "step": 516400 }, { "epoch": 30.209978358776393, "grad_norm": 0.26866838335990906, "learning_rate": 0.001, "loss": 1.8596, "step": 516500 }, { "epoch": 30.215827338129497, "grad_norm": 0.248003289103508, "learning_rate": 0.001, "loss": 1.867, "step": 516600 }, { "epoch": 30.2216763174826, "grad_norm": 0.188087597489357, "learning_rate": 0.001, "loss": 1.8651, "step": 516700 }, { "epoch": 30.2275252968357, "grad_norm": 0.1665697544813156, "learning_rate": 0.001, "loss": 1.8622, "step": 516800 }, { "epoch": 30.233374276188805, "grad_norm": 0.22649317979812622, "learning_rate": 0.001, "loss": 1.8636, "step": 516900 }, { "epoch": 30.23922325554191, "grad_norm": 0.18825647234916687, "learning_rate": 0.001, "loss": 1.866, "step": 517000 }, { "epoch": 30.245072234895012, "grad_norm": 0.19930963218212128, "learning_rate": 0.001, "loss": 1.8615, "step": 517100 }, { "epoch": 30.250921214248113, "grad_norm": 0.1583077311515808, "learning_rate": 0.001, "loss": 1.8659, "step": 517200 }, { "epoch": 30.256770193601216, "grad_norm": 0.18969860672950745, "learning_rate": 0.001, "loss": 1.8662, "step": 517300 }, { "epoch": 30.26261917295432, "grad_norm": 0.22886204719543457, "learning_rate": 0.001, "loss": 1.8666, "step": 517400 }, { "epoch": 30.268468152307424, "grad_norm": 0.22588516771793365, "learning_rate": 0.001, "loss": 1.8687, "step": 517500 }, { "epoch": 30.274317131660524, "grad_norm": 0.2567221224308014, "learning_rate": 0.001, "loss": 1.8602, "step": 517600 }, { "epoch": 30.280166111013628, "grad_norm": 0.21692898869514465, "learning_rate": 0.001, "loss": 1.8624, "step": 517700 }, { "epoch": 30.286015090366732, "grad_norm": 0.17354880273342133, "learning_rate": 0.001, "loss": 1.8647, "step": 517800 }, { "epoch": 30.291864069719836, "grad_norm": 0.17118161916732788, "learning_rate": 0.001, "loss": 1.8657, "step": 517900 }, { "epoch": 30.297713049072936, "grad_norm": 0.1791420876979828, "learning_rate": 0.001, "loss": 1.8632, "step": 518000 }, { "epoch": 30.30356202842604, "grad_norm": 0.24304358661174774, "learning_rate": 0.001, "loss": 1.8633, "step": 518100 }, { "epoch": 30.309411007779143, "grad_norm": 0.16196201741695404, "learning_rate": 0.001, "loss": 1.8667, "step": 518200 }, { "epoch": 30.315259987132244, "grad_norm": 0.19364722073078156, "learning_rate": 0.001, "loss": 1.868, "step": 518300 }, { "epoch": 30.321108966485347, "grad_norm": 0.27038058638572693, "learning_rate": 0.001, "loss": 1.8678, "step": 518400 }, { "epoch": 30.32695794583845, "grad_norm": 0.23662886023521423, "learning_rate": 0.001, "loss": 1.8696, "step": 518500 }, { "epoch": 30.332806925191555, "grad_norm": 0.30182644724845886, "learning_rate": 0.001, "loss": 1.8686, "step": 518600 }, { "epoch": 30.338655904544655, "grad_norm": 0.22098469734191895, "learning_rate": 0.001, "loss": 1.8652, "step": 518700 }, { "epoch": 30.34450488389776, "grad_norm": 0.22113417088985443, "learning_rate": 0.001, "loss": 1.8654, "step": 518800 }, { "epoch": 30.350353863250863, "grad_norm": 0.18489128351211548, "learning_rate": 0.001, "loss": 1.8652, "step": 518900 }, { "epoch": 30.356202842603967, "grad_norm": 0.22031278908252716, "learning_rate": 0.001, "loss": 1.8629, "step": 519000 }, { "epoch": 30.362051821957067, "grad_norm": 0.13216504454612732, "learning_rate": 0.001, "loss": 1.8629, "step": 519100 }, { "epoch": 30.36790080131017, "grad_norm": 0.22281335294246674, "learning_rate": 0.001, "loss": 1.8651, "step": 519200 }, { "epoch": 30.373749780663275, "grad_norm": 0.2653961777687073, "learning_rate": 0.001, "loss": 1.8667, "step": 519300 }, { "epoch": 30.37959876001638, "grad_norm": 0.18528327345848083, "learning_rate": 0.001, "loss": 1.8671, "step": 519400 }, { "epoch": 30.38544773936948, "grad_norm": 0.19628305733203888, "learning_rate": 0.001, "loss": 1.8657, "step": 519500 }, { "epoch": 30.391296718722582, "grad_norm": 0.270380437374115, "learning_rate": 0.001, "loss": 1.8694, "step": 519600 }, { "epoch": 30.397145698075686, "grad_norm": 0.18178632855415344, "learning_rate": 0.001, "loss": 1.8672, "step": 519700 }, { "epoch": 30.40299467742879, "grad_norm": 0.17464764416217804, "learning_rate": 0.001, "loss": 1.8718, "step": 519800 }, { "epoch": 30.40884365678189, "grad_norm": 0.1787894070148468, "learning_rate": 0.001, "loss": 1.8683, "step": 519900 }, { "epoch": 30.414692636134994, "grad_norm": 0.24433763325214386, "learning_rate": 0.001, "loss": 1.8662, "step": 520000 }, { "epoch": 30.420541615488098, "grad_norm": 0.23291203379631042, "learning_rate": 0.001, "loss": 1.8613, "step": 520100 }, { "epoch": 30.4263905948412, "grad_norm": 0.20873618125915527, "learning_rate": 0.001, "loss": 1.8683, "step": 520200 }, { "epoch": 30.432239574194302, "grad_norm": 0.19212567806243896, "learning_rate": 0.001, "loss": 1.864, "step": 520300 }, { "epoch": 30.438088553547406, "grad_norm": 0.2163248211145401, "learning_rate": 0.001, "loss": 1.8652, "step": 520400 }, { "epoch": 30.44393753290051, "grad_norm": 0.20525258779525757, "learning_rate": 0.001, "loss": 1.8709, "step": 520500 }, { "epoch": 30.449786512253613, "grad_norm": 0.1827993243932724, "learning_rate": 0.001, "loss": 1.8691, "step": 520600 }, { "epoch": 30.455635491606714, "grad_norm": 0.2794228792190552, "learning_rate": 0.001, "loss": 1.8627, "step": 520700 }, { "epoch": 30.461484470959817, "grad_norm": 0.2152499556541443, "learning_rate": 0.001, "loss": 1.8729, "step": 520800 }, { "epoch": 30.46733345031292, "grad_norm": 0.1731654554605484, "learning_rate": 0.001, "loss": 1.8722, "step": 520900 }, { "epoch": 30.473182429666025, "grad_norm": 0.1957617998123169, "learning_rate": 0.001, "loss": 1.8731, "step": 521000 }, { "epoch": 30.479031409019125, "grad_norm": 0.17764785885810852, "learning_rate": 0.001, "loss": 1.8698, "step": 521100 }, { "epoch": 30.48488038837223, "grad_norm": 0.17036135494709015, "learning_rate": 0.001, "loss": 1.8757, "step": 521200 }, { "epoch": 30.490729367725333, "grad_norm": 0.20134010910987854, "learning_rate": 0.001, "loss": 1.868, "step": 521300 }, { "epoch": 30.496578347078433, "grad_norm": 0.23652799427509308, "learning_rate": 0.001, "loss": 1.8759, "step": 521400 }, { "epoch": 30.502427326431537, "grad_norm": 0.28618964552879333, "learning_rate": 0.001, "loss": 1.8763, "step": 521500 }, { "epoch": 30.50827630578464, "grad_norm": 0.183725506067276, "learning_rate": 0.001, "loss": 1.8747, "step": 521600 }, { "epoch": 30.514125285137744, "grad_norm": 0.20173749327659607, "learning_rate": 0.001, "loss": 1.8704, "step": 521700 }, { "epoch": 30.519974264490845, "grad_norm": 0.18247045576572418, "learning_rate": 0.001, "loss": 1.8614, "step": 521800 }, { "epoch": 30.52582324384395, "grad_norm": 0.16222573816776276, "learning_rate": 0.001, "loss": 1.863, "step": 521900 }, { "epoch": 30.531672223197052, "grad_norm": 0.23915377259254456, "learning_rate": 0.001, "loss": 1.8622, "step": 522000 }, { "epoch": 30.537521202550156, "grad_norm": 0.3579806983470917, "learning_rate": 0.001, "loss": 1.868, "step": 522100 }, { "epoch": 30.543370181903256, "grad_norm": 0.21278797090053558, "learning_rate": 0.001, "loss": 1.8749, "step": 522200 }, { "epoch": 30.54921916125636, "grad_norm": 0.13960418105125427, "learning_rate": 0.001, "loss": 1.868, "step": 522300 }, { "epoch": 30.555068140609464, "grad_norm": 0.16288445889949799, "learning_rate": 0.001, "loss": 1.8694, "step": 522400 }, { "epoch": 30.560917119962568, "grad_norm": 0.196125790476799, "learning_rate": 0.001, "loss": 1.8688, "step": 522500 }, { "epoch": 30.566766099315668, "grad_norm": 0.17735864222049713, "learning_rate": 0.001, "loss": 1.8644, "step": 522600 }, { "epoch": 30.572615078668772, "grad_norm": 0.3031361401081085, "learning_rate": 0.001, "loss": 1.8705, "step": 522700 }, { "epoch": 30.578464058021876, "grad_norm": 0.18316753208637238, "learning_rate": 0.001, "loss": 1.8651, "step": 522800 }, { "epoch": 30.58431303737498, "grad_norm": 0.19923503696918488, "learning_rate": 0.001, "loss": 1.87, "step": 522900 }, { "epoch": 30.59016201672808, "grad_norm": 0.2679513990879059, "learning_rate": 0.001, "loss": 1.8646, "step": 523000 }, { "epoch": 30.596010996081183, "grad_norm": 0.19404132664203644, "learning_rate": 0.001, "loss": 1.8696, "step": 523100 }, { "epoch": 30.601859975434287, "grad_norm": 0.23398777842521667, "learning_rate": 0.001, "loss": 1.8651, "step": 523200 }, { "epoch": 30.60770895478739, "grad_norm": 0.2372518628835678, "learning_rate": 0.001, "loss": 1.8662, "step": 523300 }, { "epoch": 30.61355793414049, "grad_norm": 0.30955061316490173, "learning_rate": 0.001, "loss": 1.8651, "step": 523400 }, { "epoch": 30.619406913493595, "grad_norm": 0.2097671926021576, "learning_rate": 0.001, "loss": 1.8714, "step": 523500 }, { "epoch": 30.6252558928467, "grad_norm": 0.22834303975105286, "learning_rate": 0.001, "loss": 1.8695, "step": 523600 }, { "epoch": 30.631104872199803, "grad_norm": 0.1902168095111847, "learning_rate": 0.001, "loss": 1.8668, "step": 523700 }, { "epoch": 30.636953851552903, "grad_norm": 0.18755780160427094, "learning_rate": 0.001, "loss": 1.8648, "step": 523800 }, { "epoch": 30.642802830906007, "grad_norm": 0.19142314791679382, "learning_rate": 0.001, "loss": 1.8681, "step": 523900 }, { "epoch": 30.64865181025911, "grad_norm": 0.1903713494539261, "learning_rate": 0.001, "loss": 1.8702, "step": 524000 }, { "epoch": 30.654500789612214, "grad_norm": 0.19048212468624115, "learning_rate": 0.001, "loss": 1.8652, "step": 524100 }, { "epoch": 30.660349768965315, "grad_norm": 0.2130136936903, "learning_rate": 0.001, "loss": 1.8676, "step": 524200 }, { "epoch": 30.66619874831842, "grad_norm": 0.2218911498785019, "learning_rate": 0.001, "loss": 1.8739, "step": 524300 }, { "epoch": 30.672047727671522, "grad_norm": 0.19332343339920044, "learning_rate": 0.001, "loss": 1.8704, "step": 524400 }, { "epoch": 30.677896707024622, "grad_norm": 0.21147426962852478, "learning_rate": 0.001, "loss": 1.8667, "step": 524500 }, { "epoch": 30.683745686377726, "grad_norm": 0.18482346832752228, "learning_rate": 0.001, "loss": 1.8726, "step": 524600 }, { "epoch": 30.68959466573083, "grad_norm": 0.20849908888339996, "learning_rate": 0.001, "loss": 1.868, "step": 524700 }, { "epoch": 30.695443645083934, "grad_norm": 0.1996627002954483, "learning_rate": 0.001, "loss": 1.8708, "step": 524800 }, { "epoch": 30.701292624437034, "grad_norm": 0.17275631427764893, "learning_rate": 0.001, "loss": 1.869, "step": 524900 }, { "epoch": 30.707141603790138, "grad_norm": 0.2811110019683838, "learning_rate": 0.001, "loss": 1.8705, "step": 525000 }, { "epoch": 30.707141603790138, "eval_ag_news_accuracy": 0.237234375, "eval_ag_news_bleu_score": 8.189551876770672, "eval_ag_news_bleu_score_sem": 0.5311928332978683, "eval_ag_news_emb_cos_sim": 0.7507146596908569, "eval_ag_news_emb_cos_sim_sem": 0.010596547275781631, "eval_ag_news_emb_top1_equal": 0.984375, "eval_ag_news_emb_top1_equal_sem": 0.011004959233105183, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.75152325630188, "eval_ag_news_n_ngrams_match_1": 15.5859375, "eval_ag_news_n_ngrams_match_2": 4.625, "eval_ag_news_n_ngrams_match_3": 1.7890625, "eval_ag_news_num_pred_words": 47.7109375, "eval_ag_news_num_true_words": 46.5, "eval_ag_news_perplexity": 15.666477778877606, "eval_ag_news_pred_num_tokens": 68.8515625, "eval_ag_news_rouge_score": 0.31189117841177405, "eval_ag_news_runtime": 37.5584, "eval_ag_news_samples_per_second": 13.313, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3490120916312098, "eval_ag_news_token_set_f1_sem": 0.009071885341859945, "eval_ag_news_token_set_precision": 0.33042331984398143, "eval_ag_news_token_set_recall": 0.37503157343712984, "eval_ag_news_true_num_tokens": 63.6015625, "step": 525000 }, { "epoch": 30.707141603790138, "eval_anthropic_toxic_prompts_accuracy": 0.102796875, "eval_anthropic_toxic_prompts_bleu_score": 41.5437605697897, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.5673471872168188, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8867213726043701, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.00933081191033125, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1328125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030114393430435732, "eval_anthropic_toxic_prompts_loss": 1.2692956924438477, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.4609375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.4453125, "eval_anthropic_toxic_prompts_num_pred_words": 14.9296875, "eval_anthropic_toxic_prompts_num_true_words": 14.8515625, "eval_anthropic_toxic_prompts_perplexity": 3.5583455099611165, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.234375, "eval_anthropic_toxic_prompts_rouge_score": 0.6898887653720366, "eval_anthropic_toxic_prompts_runtime": 28.2166, "eval_anthropic_toxic_prompts_samples_per_second": 17.72, "eval_anthropic_toxic_prompts_steps_per_second": 0.035, "eval_anthropic_toxic_prompts_token_set_f1": 0.709998875495418, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018329410902947176, "eval_anthropic_toxic_prompts_token_set_precision": 0.7114534734003648, "eval_anthropic_toxic_prompts_token_set_recall": 0.7154375656827273, "eval_anthropic_toxic_prompts_true_num_tokens": 18.1796875, "step": 525000 }, { "epoch": 30.707141603790138, "eval_arxiv_accuracy": 0.369484375, "eval_arxiv_bleu_score": 1.8699505438494946, "eval_arxiv_bleu_score_sem": 0.19640657948827184, "eval_arxiv_emb_cos_sim": 0.48168060183525085, "eval_arxiv_emb_cos_sim_sem": 0.01927620731294155, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4449362754821777, "eval_arxiv_n_ngrams_match_1": 12.5703125, "eval_arxiv_n_ngrams_match_2": 2.234375, "eval_arxiv_n_ngrams_match_3": 0.5859375, "eval_arxiv_num_pred_words": 51.671875, "eval_arxiv_num_true_words": 84.5859375, "eval_arxiv_perplexity": 31.34128617468305, "eval_arxiv_pred_num_tokens": 124.9296875, "eval_arxiv_rouge_score": 0.17533841343878487, "eval_arxiv_runtime": 29.8201, "eval_arxiv_samples_per_second": 16.767, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.18168293696805154, "eval_arxiv_token_set_f1_sem": 0.008645992370955388, "eval_arxiv_token_set_precision": 0.12454980972497125, "eval_arxiv_token_set_recall": 0.41175474827977926, "eval_arxiv_true_num_tokens": 123.25, "step": 525000 }, { "epoch": 30.707141603790138, "eval_python_code_alpaca_accuracy": 0.128265625, "eval_python_code_alpaca_bleu_score": 26.000973543549257, "eval_python_code_alpaca_bleu_score_sem": 1.5205632088797647, "eval_python_code_alpaca_emb_cos_sim": 0.8650904893875122, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008371585048735142, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.512217402458191, "eval_python_code_alpaca_n_ngrams_match_1": 9.7421875, "eval_python_code_alpaca_n_ngrams_match_2": 5.234375, "eval_python_code_alpaca_n_ngrams_match_3": 2.953125, "eval_python_code_alpaca_num_pred_words": 16.78125, "eval_python_code_alpaca_num_true_words": 18.5234375, "eval_python_code_alpaca_perplexity": 4.536779515216992, "eval_python_code_alpaca_pred_num_tokens": 23.2734375, "eval_python_code_alpaca_rouge_score": 0.5677988591481844, "eval_python_code_alpaca_runtime": 28.9306, "eval_python_code_alpaca_samples_per_second": 17.283, "eval_python_code_alpaca_steps_per_second": 0.035, "eval_python_code_alpaca_token_set_f1": 0.5877923566645928, "eval_python_code_alpaca_token_set_f1_sem": 0.01413387616332784, "eval_python_code_alpaca_token_set_precision": 0.5728817284908344, "eval_python_code_alpaca_token_set_recall": 0.6134938739446646, "eval_python_code_alpaca_true_num_tokens": 23.515625, "step": 525000 }, { "epoch": 30.707141603790138, "eval_wikibio_accuracy": 0.353671875, "eval_wikibio_bleu_score": 7.525900860615304, "eval_wikibio_bleu_score_sem": 0.7153289653576793, "eval_wikibio_emb_cos_sim": 0.6231575012207031, "eval_wikibio_emb_cos_sim_sem": 0.021853355690836906, "eval_wikibio_emb_top1_equal": 0.9609375, "eval_wikibio_emb_top1_equal_sem": 0.017191974446177483, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7368717193603516, "eval_wikibio_n_ngrams_match_1": 14.703125, "eval_wikibio_n_ngrams_match_2": 5.2578125, "eval_wikibio_n_ngrams_match_3": 2.109375, "eval_wikibio_num_pred_words": 51.5703125, "eval_wikibio_num_true_words": 53.6171875, "eval_wikibio_perplexity": 15.438613160639377, "eval_wikibio_pred_num_tokens": 102.9453125, "eval_wikibio_rouge_score": 0.298998808965784, "eval_wikibio_runtime": 30.7978, "eval_wikibio_samples_per_second": 16.235, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.3208222133624699, "eval_wikibio_token_set_f1_sem": 0.012371800938540558, "eval_wikibio_token_set_precision": 0.2841643782804254, "eval_wikibio_token_set_recall": 0.4076435111417992, "eval_wikibio_true_num_tokens": 100.8828125, "step": 525000 }, { "epoch": 30.707141603790138, "eval_msmarco_accuracy": 0.392328125, "eval_msmarco_bleu_score": 17.46492380224579, "eval_msmarco_bleu_score_sem": 1.394959995386065, "eval_msmarco_emb_cos_sim": 0.7750993371009827, "eval_msmarco_emb_cos_sim_sem": 0.01724708452820778, "eval_msmarco_emb_top1_equal": 0.9453125, "eval_msmarco_emb_top1_equal_sem": 0.020175758749246597, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7499244213104248, "eval_msmarco_n_ngrams_match_1": 29.34375, "eval_msmarco_n_ngrams_match_2": 13.46875, "eval_msmarco_n_ngrams_match_3": 7.8671875, "eval_msmarco_num_pred_words": 62.0703125, "eval_msmarco_num_true_words": 64.28125, "eval_msmarco_perplexity": 5.754167767111581, "eval_msmarco_pred_num_tokens": 84.34375, "eval_msmarco_rouge_score": 0.4464490834355612, "eval_msmarco_runtime": 25.2463, "eval_msmarco_samples_per_second": 19.805, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.47402411816506723, "eval_msmarco_token_set_f1_sem": 0.014622624476354483, "eval_msmarco_token_set_precision": 0.4350408116209397, "eval_msmarco_token_set_recall": 0.5509906210645091, "eval_msmarco_true_num_tokens": 82.21875, "step": 525000 }, { "epoch": 30.71299058314324, "grad_norm": 0.2083139270544052, "learning_rate": 0.001, "loss": 1.8704, "step": 525100 }, { "epoch": 30.718839562496346, "grad_norm": 0.17064958810806274, "learning_rate": 0.001, "loss": 1.8691, "step": 525200 }, { "epoch": 30.724688541849446, "grad_norm": 0.1565273106098175, "learning_rate": 0.001, "loss": 1.8696, "step": 525300 }, { "epoch": 30.73053752120255, "grad_norm": 0.1769000142812729, "learning_rate": 0.001, "loss": 1.8711, "step": 525400 }, { "epoch": 30.736386500555653, "grad_norm": 0.2437119036912918, "learning_rate": 0.001, "loss": 1.8742, "step": 525500 }, { "epoch": 30.742235479908757, "grad_norm": 0.19734631478786469, "learning_rate": 0.001, "loss": 1.8654, "step": 525600 }, { "epoch": 30.748084459261857, "grad_norm": 0.1939842849969864, "learning_rate": 0.001, "loss": 1.8772, "step": 525700 }, { "epoch": 30.75393343861496, "grad_norm": 0.2331225723028183, "learning_rate": 0.001, "loss": 1.8662, "step": 525800 }, { "epoch": 30.759782417968065, "grad_norm": 0.18473245203495026, "learning_rate": 0.001, "loss": 1.8643, "step": 525900 }, { "epoch": 30.76563139732117, "grad_norm": 0.1836753785610199, "learning_rate": 0.001, "loss": 1.8689, "step": 526000 }, { "epoch": 30.77148037667427, "grad_norm": 0.22843915224075317, "learning_rate": 0.001, "loss": 1.863, "step": 526100 }, { "epoch": 30.777329356027373, "grad_norm": 0.22260557115077972, "learning_rate": 0.001, "loss": 1.8732, "step": 526200 }, { "epoch": 30.783178335380477, "grad_norm": 0.2241542786359787, "learning_rate": 0.001, "loss": 1.8759, "step": 526300 }, { "epoch": 30.78902731473358, "grad_norm": 0.1672990471124649, "learning_rate": 0.001, "loss": 1.869, "step": 526400 }, { "epoch": 30.79487629408668, "grad_norm": 0.19661138951778412, "learning_rate": 0.001, "loss": 1.8743, "step": 526500 }, { "epoch": 30.800725273439785, "grad_norm": 0.17884626984596252, "learning_rate": 0.001, "loss": 1.8716, "step": 526600 }, { "epoch": 30.80657425279289, "grad_norm": 0.15408234298229218, "learning_rate": 0.001, "loss": 1.8738, "step": 526700 }, { "epoch": 30.812423232145992, "grad_norm": 0.20201033353805542, "learning_rate": 0.001, "loss": 1.8733, "step": 526800 }, { "epoch": 30.818272211499092, "grad_norm": 0.1948365569114685, "learning_rate": 0.001, "loss": 1.8728, "step": 526900 }, { "epoch": 30.824121190852196, "grad_norm": 0.19813470542430878, "learning_rate": 0.001, "loss": 1.8693, "step": 527000 }, { "epoch": 30.8299701702053, "grad_norm": 0.23301197588443756, "learning_rate": 0.001, "loss": 1.8739, "step": 527100 }, { "epoch": 30.835819149558404, "grad_norm": 0.22613683342933655, "learning_rate": 0.001, "loss": 1.8639, "step": 527200 }, { "epoch": 30.841668128911504, "grad_norm": 0.1672244817018509, "learning_rate": 0.001, "loss": 1.8629, "step": 527300 }, { "epoch": 30.847517108264608, "grad_norm": 0.20938554406166077, "learning_rate": 0.001, "loss": 1.8703, "step": 527400 }, { "epoch": 30.85336608761771, "grad_norm": 0.17336425185203552, "learning_rate": 0.001, "loss": 1.8581, "step": 527500 }, { "epoch": 30.859215066970812, "grad_norm": 0.19294880330562592, "learning_rate": 0.001, "loss": 1.8722, "step": 527600 }, { "epoch": 30.865064046323916, "grad_norm": 0.16961432993412018, "learning_rate": 0.001, "loss": 1.8712, "step": 527700 }, { "epoch": 30.87091302567702, "grad_norm": 0.17090144753456116, "learning_rate": 0.001, "loss": 1.8652, "step": 527800 }, { "epoch": 30.876762005030123, "grad_norm": 0.1906457245349884, "learning_rate": 0.001, "loss": 1.8704, "step": 527900 }, { "epoch": 30.882610984383223, "grad_norm": 0.18602778017520905, "learning_rate": 0.001, "loss": 1.8658, "step": 528000 }, { "epoch": 30.888459963736327, "grad_norm": 0.18469083309173584, "learning_rate": 0.001, "loss": 1.8716, "step": 528100 }, { "epoch": 30.89430894308943, "grad_norm": 0.28300032019615173, "learning_rate": 0.001, "loss": 1.8733, "step": 528200 }, { "epoch": 30.900157922442535, "grad_norm": 0.13943909108638763, "learning_rate": 0.001, "loss": 1.867, "step": 528300 }, { "epoch": 30.906006901795635, "grad_norm": 0.1554625779390335, "learning_rate": 0.001, "loss": 1.8672, "step": 528400 }, { "epoch": 30.91185588114874, "grad_norm": 0.17717546224594116, "learning_rate": 0.001, "loss": 1.8677, "step": 528500 }, { "epoch": 30.917704860501843, "grad_norm": 0.18096476793289185, "learning_rate": 0.001, "loss": 1.8695, "step": 528600 }, { "epoch": 30.923553839854947, "grad_norm": 0.183620423078537, "learning_rate": 0.001, "loss": 1.8726, "step": 528700 }, { "epoch": 30.929402819208047, "grad_norm": 0.19762232899665833, "learning_rate": 0.001, "loss": 1.8684, "step": 528800 }, { "epoch": 30.93525179856115, "grad_norm": 0.1612246036529541, "learning_rate": 0.001, "loss": 1.8697, "step": 528900 }, { "epoch": 30.941100777914254, "grad_norm": 0.2847733199596405, "learning_rate": 0.001, "loss": 1.8714, "step": 529000 }, { "epoch": 30.946949757267358, "grad_norm": 0.19648919999599457, "learning_rate": 0.001, "loss": 1.8682, "step": 529100 }, { "epoch": 30.95279873662046, "grad_norm": 0.28451263904571533, "learning_rate": 0.001, "loss": 1.8738, "step": 529200 }, { "epoch": 30.958647715973562, "grad_norm": 0.23091275990009308, "learning_rate": 0.001, "loss": 1.8743, "step": 529300 }, { "epoch": 30.964496695326666, "grad_norm": 0.15380285680294037, "learning_rate": 0.001, "loss": 1.8656, "step": 529400 }, { "epoch": 30.97034567467977, "grad_norm": 0.3371586203575134, "learning_rate": 0.001, "loss": 1.8717, "step": 529500 }, { "epoch": 30.97619465403287, "grad_norm": 0.14742615818977356, "learning_rate": 0.001, "loss": 1.8716, "step": 529600 }, { "epoch": 30.982043633385974, "grad_norm": 0.19813945889472961, "learning_rate": 0.001, "loss": 1.8616, "step": 529700 }, { "epoch": 30.987892612739078, "grad_norm": 0.21296004951000214, "learning_rate": 0.001, "loss": 1.8692, "step": 529800 }, { "epoch": 30.99374159209218, "grad_norm": 0.1620495766401291, "learning_rate": 0.001, "loss": 1.8672, "step": 529900 }, { "epoch": 30.99959057144528, "grad_norm": 0.1825082153081894, "learning_rate": 0.001, "loss": 1.8676, "step": 530000 }, { "epoch": 31.005439550798386, "grad_norm": 0.2019050419330597, "learning_rate": 0.001, "loss": 1.8613, "step": 530100 }, { "epoch": 31.01128853015149, "grad_norm": 0.22084783017635345, "learning_rate": 0.001, "loss": 1.8571, "step": 530200 }, { "epoch": 31.017137509504593, "grad_norm": 0.20047466456890106, "learning_rate": 0.001, "loss": 1.8544, "step": 530300 }, { "epoch": 31.022986488857693, "grad_norm": 0.2305893898010254, "learning_rate": 0.001, "loss": 1.8588, "step": 530400 }, { "epoch": 31.028835468210797, "grad_norm": 0.1762876659631729, "learning_rate": 0.001, "loss": 1.8554, "step": 530500 }, { "epoch": 31.0346844475639, "grad_norm": 0.20521900057792664, "learning_rate": 0.001, "loss": 1.8598, "step": 530600 }, { "epoch": 31.040533426917, "grad_norm": 0.18554919958114624, "learning_rate": 0.001, "loss": 1.8577, "step": 530700 }, { "epoch": 31.046382406270105, "grad_norm": 0.22389458119869232, "learning_rate": 0.001, "loss": 1.8531, "step": 530800 }, { "epoch": 31.05223138562321, "grad_norm": 0.18045306205749512, "learning_rate": 0.001, "loss": 1.8549, "step": 530900 }, { "epoch": 31.058080364976313, "grad_norm": 0.19868361949920654, "learning_rate": 0.001, "loss": 1.8496, "step": 531000 }, { "epoch": 31.063929344329413, "grad_norm": 0.2524946928024292, "learning_rate": 0.001, "loss": 1.8517, "step": 531100 }, { "epoch": 31.069778323682517, "grad_norm": 0.21624228358268738, "learning_rate": 0.001, "loss": 1.8562, "step": 531200 }, { "epoch": 31.07562730303562, "grad_norm": 0.1912635862827301, "learning_rate": 0.001, "loss": 1.8609, "step": 531300 }, { "epoch": 31.081476282388724, "grad_norm": 0.20054063200950623, "learning_rate": 0.001, "loss": 1.859, "step": 531400 }, { "epoch": 31.087325261741825, "grad_norm": 0.18153995275497437, "learning_rate": 0.001, "loss": 1.8569, "step": 531500 }, { "epoch": 31.09317424109493, "grad_norm": 0.18293015658855438, "learning_rate": 0.001, "loss": 1.8629, "step": 531600 }, { "epoch": 31.099023220448032, "grad_norm": 0.1657881736755371, "learning_rate": 0.001, "loss": 1.8563, "step": 531700 }, { "epoch": 31.104872199801136, "grad_norm": 0.23803368210792542, "learning_rate": 0.001, "loss": 1.8549, "step": 531800 }, { "epoch": 31.110721179154236, "grad_norm": 0.2644134759902954, "learning_rate": 0.001, "loss": 1.8568, "step": 531900 }, { "epoch": 31.11657015850734, "grad_norm": 0.21815554797649384, "learning_rate": 0.001, "loss": 1.8564, "step": 532000 }, { "epoch": 31.122419137860444, "grad_norm": 0.20893293619155884, "learning_rate": 0.001, "loss": 1.8611, "step": 532100 }, { "epoch": 31.128268117213548, "grad_norm": 0.13870848715305328, "learning_rate": 0.001, "loss": 1.8541, "step": 532200 }, { "epoch": 31.134117096566648, "grad_norm": 0.1911594420671463, "learning_rate": 0.001, "loss": 1.8573, "step": 532300 }, { "epoch": 31.13996607591975, "grad_norm": 0.26248350739479065, "learning_rate": 0.001, "loss": 1.8612, "step": 532400 }, { "epoch": 31.145815055272855, "grad_norm": 0.19776779413223267, "learning_rate": 0.001, "loss": 1.8545, "step": 532500 }, { "epoch": 31.15166403462596, "grad_norm": 0.21618930995464325, "learning_rate": 0.001, "loss": 1.8557, "step": 532600 }, { "epoch": 31.15751301397906, "grad_norm": 0.16028353571891785, "learning_rate": 0.001, "loss": 1.8545, "step": 532700 }, { "epoch": 31.163361993332163, "grad_norm": 0.2016591727733612, "learning_rate": 0.001, "loss": 1.8605, "step": 532800 }, { "epoch": 31.169210972685267, "grad_norm": 0.21550866961479187, "learning_rate": 0.001, "loss": 1.8528, "step": 532900 }, { "epoch": 31.17505995203837, "grad_norm": 0.31089240312576294, "learning_rate": 0.001, "loss": 1.8603, "step": 533000 }, { "epoch": 31.18090893139147, "grad_norm": 0.19132372736930847, "learning_rate": 0.001, "loss": 1.8632, "step": 533100 }, { "epoch": 31.186757910744575, "grad_norm": 0.15665927529335022, "learning_rate": 0.001, "loss": 1.8524, "step": 533200 }, { "epoch": 31.19260689009768, "grad_norm": 0.19810563325881958, "learning_rate": 0.001, "loss": 1.865, "step": 533300 }, { "epoch": 31.198455869450783, "grad_norm": 0.20976294577121735, "learning_rate": 0.001, "loss": 1.8586, "step": 533400 }, { "epoch": 31.204304848803883, "grad_norm": 0.22204867005348206, "learning_rate": 0.001, "loss": 1.8644, "step": 533500 }, { "epoch": 31.210153828156987, "grad_norm": 0.1990436464548111, "learning_rate": 0.001, "loss": 1.8663, "step": 533600 }, { "epoch": 31.21600280751009, "grad_norm": 0.2123420685529709, "learning_rate": 0.001, "loss": 1.8629, "step": 533700 }, { "epoch": 31.22185178686319, "grad_norm": 0.1847943812608719, "learning_rate": 0.001, "loss": 1.854, "step": 533800 }, { "epoch": 31.227700766216294, "grad_norm": 0.1970657855272293, "learning_rate": 0.001, "loss": 1.8591, "step": 533900 }, { "epoch": 31.2335497455694, "grad_norm": 0.19255289435386658, "learning_rate": 0.001, "loss": 1.8559, "step": 534000 }, { "epoch": 31.239398724922502, "grad_norm": 0.2527511417865753, "learning_rate": 0.001, "loss": 1.861, "step": 534100 }, { "epoch": 31.245247704275602, "grad_norm": 0.2141774445772171, "learning_rate": 0.001, "loss": 1.8611, "step": 534200 }, { "epoch": 31.251096683628706, "grad_norm": 0.1947704255580902, "learning_rate": 0.001, "loss": 1.8565, "step": 534300 }, { "epoch": 31.25694566298181, "grad_norm": 0.39558643102645874, "learning_rate": 0.001, "loss": 1.8592, "step": 534400 }, { "epoch": 31.262794642334914, "grad_norm": 0.15511202812194824, "learning_rate": 0.001, "loss": 1.8598, "step": 534500 }, { "epoch": 31.268643621688014, "grad_norm": 0.2208353579044342, "learning_rate": 0.001, "loss": 1.8589, "step": 534600 }, { "epoch": 31.274492601041118, "grad_norm": 0.19104990363121033, "learning_rate": 0.001, "loss": 1.8645, "step": 534700 }, { "epoch": 31.28034158039422, "grad_norm": 0.23068618774414062, "learning_rate": 0.001, "loss": 1.861, "step": 534800 }, { "epoch": 31.286190559747325, "grad_norm": 0.2621937692165375, "learning_rate": 0.001, "loss": 1.8617, "step": 534900 }, { "epoch": 31.292039539100426, "grad_norm": 0.20127232372760773, "learning_rate": 0.001, "loss": 1.8613, "step": 535000 }, { "epoch": 31.29788851845353, "grad_norm": 0.23755918443202972, "learning_rate": 0.001, "loss": 1.8544, "step": 535100 }, { "epoch": 31.303737497806633, "grad_norm": 0.22378034889698029, "learning_rate": 0.001, "loss": 1.8531, "step": 535200 }, { "epoch": 31.309586477159737, "grad_norm": 0.1913299858570099, "learning_rate": 0.001, "loss": 1.8668, "step": 535300 }, { "epoch": 31.315435456512837, "grad_norm": 0.2308385819196701, "learning_rate": 0.001, "loss": 1.8722, "step": 535400 }, { "epoch": 31.32128443586594, "grad_norm": 0.20678876340389252, "learning_rate": 0.001, "loss": 1.8625, "step": 535500 }, { "epoch": 31.327133415219045, "grad_norm": 0.18197636306285858, "learning_rate": 0.001, "loss": 1.8662, "step": 535600 }, { "epoch": 31.33298239457215, "grad_norm": 0.21074657142162323, "learning_rate": 0.001, "loss": 1.863, "step": 535700 }, { "epoch": 31.33883137392525, "grad_norm": 0.18548400700092316, "learning_rate": 0.001, "loss": 1.8626, "step": 535800 }, { "epoch": 31.344680353278353, "grad_norm": 0.15014639496803284, "learning_rate": 0.001, "loss": 1.8594, "step": 535900 }, { "epoch": 31.350529332631456, "grad_norm": 0.18488116562366486, "learning_rate": 0.001, "loss": 1.8647, "step": 536000 }, { "epoch": 31.35637831198456, "grad_norm": 0.1972818672657013, "learning_rate": 0.001, "loss": 1.8609, "step": 536100 }, { "epoch": 31.36222729133766, "grad_norm": 0.2463124692440033, "learning_rate": 0.001, "loss": 1.8654, "step": 536200 }, { "epoch": 31.368076270690764, "grad_norm": 0.20091480016708374, "learning_rate": 0.001, "loss": 1.863, "step": 536300 }, { "epoch": 31.373925250043868, "grad_norm": 0.23999197781085968, "learning_rate": 0.001, "loss": 1.8641, "step": 536400 }, { "epoch": 31.379774229396972, "grad_norm": 0.22848697006702423, "learning_rate": 0.001, "loss": 1.8648, "step": 536500 }, { "epoch": 31.385623208750072, "grad_norm": 0.25316083431243896, "learning_rate": 0.001, "loss": 1.8677, "step": 536600 }, { "epoch": 31.391472188103176, "grad_norm": 0.24020008742809296, "learning_rate": 0.001, "loss": 1.868, "step": 536700 }, { "epoch": 31.39732116745628, "grad_norm": 0.15426748991012573, "learning_rate": 0.001, "loss": 1.861, "step": 536800 }, { "epoch": 31.40317014680938, "grad_norm": 0.22594060003757477, "learning_rate": 0.001, "loss": 1.8619, "step": 536900 }, { "epoch": 31.409019126162484, "grad_norm": 0.17124374210834503, "learning_rate": 0.001, "loss": 1.8632, "step": 537000 }, { "epoch": 31.414868105515588, "grad_norm": 0.26854610443115234, "learning_rate": 0.001, "loss": 1.8632, "step": 537100 }, { "epoch": 31.42071708486869, "grad_norm": 0.1547001153230667, "learning_rate": 0.001, "loss": 1.8613, "step": 537200 }, { "epoch": 31.42656606422179, "grad_norm": 0.19423596560955048, "learning_rate": 0.001, "loss": 1.8594, "step": 537300 }, { "epoch": 31.432415043574895, "grad_norm": 0.21727554500102997, "learning_rate": 0.001, "loss": 1.8626, "step": 537400 }, { "epoch": 31.438264022928, "grad_norm": 0.26881247758865356, "learning_rate": 0.001, "loss": 1.8608, "step": 537500 }, { "epoch": 31.444113002281103, "grad_norm": 0.2261084020137787, "learning_rate": 0.001, "loss": 1.8664, "step": 537600 }, { "epoch": 31.449961981634203, "grad_norm": 0.2015170007944107, "learning_rate": 0.001, "loss": 1.8599, "step": 537700 }, { "epoch": 31.455810960987307, "grad_norm": 0.1707366406917572, "learning_rate": 0.001, "loss": 1.8634, "step": 537800 }, { "epoch": 31.46165994034041, "grad_norm": 0.16246408224105835, "learning_rate": 0.001, "loss": 1.8618, "step": 537900 }, { "epoch": 31.467508919693515, "grad_norm": 0.16960377991199493, "learning_rate": 0.001, "loss": 1.8689, "step": 538000 }, { "epoch": 31.473357899046615, "grad_norm": 0.21368667483329773, "learning_rate": 0.001, "loss": 1.8651, "step": 538100 }, { "epoch": 31.47920687839972, "grad_norm": 0.1727307289838791, "learning_rate": 0.001, "loss": 1.8583, "step": 538200 }, { "epoch": 31.485055857752823, "grad_norm": 0.20416082441806793, "learning_rate": 0.001, "loss": 1.8572, "step": 538300 }, { "epoch": 31.490904837105926, "grad_norm": 0.1779516339302063, "learning_rate": 0.001, "loss": 1.8638, "step": 538400 }, { "epoch": 31.496753816459027, "grad_norm": 0.14847999811172485, "learning_rate": 0.001, "loss": 1.8648, "step": 538500 }, { "epoch": 31.50260279581213, "grad_norm": 0.24759256839752197, "learning_rate": 0.001, "loss": 1.8662, "step": 538600 }, { "epoch": 31.508451775165234, "grad_norm": 0.24260710179805756, "learning_rate": 0.001, "loss": 1.8558, "step": 538700 }, { "epoch": 31.514300754518338, "grad_norm": 0.21349862217903137, "learning_rate": 0.001, "loss": 1.8626, "step": 538800 }, { "epoch": 31.52014973387144, "grad_norm": 0.1856163591146469, "learning_rate": 0.001, "loss": 1.863, "step": 538900 }, { "epoch": 31.525998713224542, "grad_norm": 0.20078144967556, "learning_rate": 0.001, "loss": 1.8633, "step": 539000 }, { "epoch": 31.531847692577646, "grad_norm": 0.22695380449295044, "learning_rate": 0.001, "loss": 1.8641, "step": 539100 }, { "epoch": 31.53769667193075, "grad_norm": 0.20216187834739685, "learning_rate": 0.001, "loss": 1.8685, "step": 539200 }, { "epoch": 31.54354565128385, "grad_norm": 0.17493395507335663, "learning_rate": 0.001, "loss": 1.8659, "step": 539300 }, { "epoch": 31.549394630636954, "grad_norm": 0.17678657174110413, "learning_rate": 0.001, "loss": 1.8662, "step": 539400 }, { "epoch": 31.555243609990058, "grad_norm": 0.2172500044107437, "learning_rate": 0.001, "loss": 1.8662, "step": 539500 }, { "epoch": 31.56109258934316, "grad_norm": 0.17354810237884521, "learning_rate": 0.001, "loss": 1.8674, "step": 539600 }, { "epoch": 31.56694156869626, "grad_norm": 0.19481907784938812, "learning_rate": 0.001, "loss": 1.8709, "step": 539700 }, { "epoch": 31.572790548049365, "grad_norm": 0.23391114175319672, "learning_rate": 0.001, "loss": 1.8667, "step": 539800 }, { "epoch": 31.57863952740247, "grad_norm": 0.21317104995250702, "learning_rate": 0.001, "loss": 1.864, "step": 539900 }, { "epoch": 31.58448850675557, "grad_norm": 0.2233983278274536, "learning_rate": 0.001, "loss": 1.8689, "step": 540000 }, { "epoch": 31.590337486108673, "grad_norm": 0.19891002774238586, "learning_rate": 0.001, "loss": 1.874, "step": 540100 }, { "epoch": 31.596186465461777, "grad_norm": 0.21616551280021667, "learning_rate": 0.001, "loss": 1.8686, "step": 540200 }, { "epoch": 31.60203544481488, "grad_norm": 0.1911880522966385, "learning_rate": 0.001, "loss": 1.8642, "step": 540300 }, { "epoch": 31.60788442416798, "grad_norm": 0.1872926652431488, "learning_rate": 0.001, "loss": 1.866, "step": 540400 }, { "epoch": 31.613733403521085, "grad_norm": 0.2272842824459076, "learning_rate": 0.001, "loss": 1.8671, "step": 540500 }, { "epoch": 31.61958238287419, "grad_norm": 0.2501463294029236, "learning_rate": 0.001, "loss": 1.8714, "step": 540600 }, { "epoch": 31.625431362227292, "grad_norm": 0.21343885362148285, "learning_rate": 0.001, "loss": 1.8665, "step": 540700 }, { "epoch": 31.631280341580393, "grad_norm": 0.1827097088098526, "learning_rate": 0.001, "loss": 1.8685, "step": 540800 }, { "epoch": 31.637129320933497, "grad_norm": 0.189096599817276, "learning_rate": 0.001, "loss": 1.8594, "step": 540900 }, { "epoch": 31.6429783002866, "grad_norm": 0.2109164595603943, "learning_rate": 0.001, "loss": 1.8621, "step": 541000 }, { "epoch": 31.648827279639704, "grad_norm": 0.19657623767852783, "learning_rate": 0.001, "loss": 1.867, "step": 541100 }, { "epoch": 31.654676258992804, "grad_norm": 0.239857017993927, "learning_rate": 0.001, "loss": 1.8619, "step": 541200 }, { "epoch": 31.660525238345908, "grad_norm": 0.19991575181484222, "learning_rate": 0.001, "loss": 1.8651, "step": 541300 }, { "epoch": 31.666374217699012, "grad_norm": 0.23650479316711426, "learning_rate": 0.001, "loss": 1.8661, "step": 541400 }, { "epoch": 31.672223197052116, "grad_norm": 0.19596900045871735, "learning_rate": 0.001, "loss": 1.8714, "step": 541500 }, { "epoch": 31.678072176405216, "grad_norm": 0.22386577725410461, "learning_rate": 0.001, "loss": 1.8659, "step": 541600 }, { "epoch": 31.68392115575832, "grad_norm": 0.16514551639556885, "learning_rate": 0.001, "loss": 1.8675, "step": 541700 }, { "epoch": 31.689770135111424, "grad_norm": 0.2150893211364746, "learning_rate": 0.001, "loss": 1.8634, "step": 541800 }, { "epoch": 31.695619114464527, "grad_norm": 0.25738605856895447, "learning_rate": 0.001, "loss": 1.8607, "step": 541900 }, { "epoch": 31.701468093817628, "grad_norm": 0.2370019108057022, "learning_rate": 0.001, "loss": 1.8602, "step": 542000 }, { "epoch": 31.70731707317073, "grad_norm": 0.2539718747138977, "learning_rate": 0.001, "loss": 1.8652, "step": 542100 }, { "epoch": 31.713166052523835, "grad_norm": 0.1949501931667328, "learning_rate": 0.001, "loss": 1.8653, "step": 542200 }, { "epoch": 31.71901503187694, "grad_norm": 0.19289253652095795, "learning_rate": 0.001, "loss": 1.867, "step": 542300 }, { "epoch": 31.72486401123004, "grad_norm": 0.17397379875183105, "learning_rate": 0.001, "loss": 1.8607, "step": 542400 }, { "epoch": 31.730712990583143, "grad_norm": 0.18467044830322266, "learning_rate": 0.001, "loss": 1.8587, "step": 542500 }, { "epoch": 31.736561969936247, "grad_norm": 0.22399461269378662, "learning_rate": 0.001, "loss": 1.8664, "step": 542600 }, { "epoch": 31.74241094928935, "grad_norm": 0.17150337994098663, "learning_rate": 0.001, "loss": 1.8674, "step": 542700 }, { "epoch": 31.74825992864245, "grad_norm": 0.2606269419193268, "learning_rate": 0.001, "loss": 1.871, "step": 542800 }, { "epoch": 31.754108907995555, "grad_norm": 0.2604304552078247, "learning_rate": 0.001, "loss": 1.8699, "step": 542900 }, { "epoch": 31.75995788734866, "grad_norm": 0.20278285443782806, "learning_rate": 0.001, "loss": 1.8684, "step": 543000 }, { "epoch": 31.76580686670176, "grad_norm": 0.19407536089420319, "learning_rate": 0.001, "loss": 1.8623, "step": 543100 }, { "epoch": 31.771655846054863, "grad_norm": 0.17334355413913727, "learning_rate": 0.001, "loss": 1.863, "step": 543200 }, { "epoch": 31.777504825407966, "grad_norm": 0.2290433645248413, "learning_rate": 0.001, "loss": 1.8697, "step": 543300 }, { "epoch": 31.78335380476107, "grad_norm": 0.21410904824733734, "learning_rate": 0.001, "loss": 1.864, "step": 543400 }, { "epoch": 31.78920278411417, "grad_norm": 0.16832759976387024, "learning_rate": 0.001, "loss": 1.864, "step": 543500 }, { "epoch": 31.795051763467274, "grad_norm": 0.2238048017024994, "learning_rate": 0.001, "loss": 1.8678, "step": 543600 }, { "epoch": 31.800900742820378, "grad_norm": 0.22811929881572723, "learning_rate": 0.001, "loss": 1.8592, "step": 543700 }, { "epoch": 31.806749722173482, "grad_norm": 0.16935887932777405, "learning_rate": 0.001, "loss": 1.8665, "step": 543800 }, { "epoch": 31.812598701526582, "grad_norm": 0.21000336110591888, "learning_rate": 0.001, "loss": 1.8679, "step": 543900 }, { "epoch": 31.818447680879686, "grad_norm": 0.1730685532093048, "learning_rate": 0.001, "loss": 1.8659, "step": 544000 }, { "epoch": 31.82429666023279, "grad_norm": 0.21793466806411743, "learning_rate": 0.001, "loss": 1.8646, "step": 544100 }, { "epoch": 31.830145639585893, "grad_norm": 0.18563269078731537, "learning_rate": 0.001, "loss": 1.8658, "step": 544200 }, { "epoch": 31.835994618938994, "grad_norm": 0.16008979082107544, "learning_rate": 0.001, "loss": 1.868, "step": 544300 }, { "epoch": 31.841843598292098, "grad_norm": 0.1864756941795349, "learning_rate": 0.001, "loss": 1.8666, "step": 544400 }, { "epoch": 31.8476925776452, "grad_norm": 0.17948943376541138, "learning_rate": 0.001, "loss": 1.8676, "step": 544500 }, { "epoch": 31.853541556998305, "grad_norm": 0.19565977156162262, "learning_rate": 0.001, "loss": 1.8703, "step": 544600 }, { "epoch": 31.859390536351405, "grad_norm": 0.2675235867500305, "learning_rate": 0.001, "loss": 1.8665, "step": 544700 }, { "epoch": 31.86523951570451, "grad_norm": 0.22006510198116302, "learning_rate": 0.001, "loss": 1.864, "step": 544800 }, { "epoch": 31.871088495057613, "grad_norm": 0.21508097648620605, "learning_rate": 0.001, "loss": 1.8679, "step": 544900 }, { "epoch": 31.876937474410717, "grad_norm": 0.22518302500247955, "learning_rate": 0.001, "loss": 1.8666, "step": 545000 }, { "epoch": 31.882786453763817, "grad_norm": 0.26997795701026917, "learning_rate": 0.001, "loss": 1.8678, "step": 545100 }, { "epoch": 31.88863543311692, "grad_norm": 0.2122233510017395, "learning_rate": 0.001, "loss": 1.8662, "step": 545200 }, { "epoch": 31.894484412470025, "grad_norm": 0.269663006067276, "learning_rate": 0.001, "loss": 1.8662, "step": 545300 }, { "epoch": 31.90033339182313, "grad_norm": 0.1959758847951889, "learning_rate": 0.001, "loss": 1.8685, "step": 545400 }, { "epoch": 31.90618237117623, "grad_norm": 0.2369006723165512, "learning_rate": 0.001, "loss": 1.8652, "step": 545500 }, { "epoch": 31.912031350529332, "grad_norm": 0.20989878475666046, "learning_rate": 0.001, "loss": 1.8749, "step": 545600 }, { "epoch": 31.917880329882436, "grad_norm": 0.19565138220787048, "learning_rate": 0.001, "loss": 1.8736, "step": 545700 }, { "epoch": 31.92372930923554, "grad_norm": 0.19709600508213043, "learning_rate": 0.001, "loss": 1.8622, "step": 545800 }, { "epoch": 31.92957828858864, "grad_norm": 0.25079941749572754, "learning_rate": 0.001, "loss": 1.8597, "step": 545900 }, { "epoch": 31.935427267941744, "grad_norm": 0.2788575291633606, "learning_rate": 0.001, "loss": 1.8711, "step": 546000 }, { "epoch": 31.941276247294848, "grad_norm": 0.19986432790756226, "learning_rate": 0.001, "loss": 1.8611, "step": 546100 }, { "epoch": 31.947125226647948, "grad_norm": 0.16129951179027557, "learning_rate": 0.001, "loss": 1.8647, "step": 546200 }, { "epoch": 31.952974206001052, "grad_norm": 0.18092119693756104, "learning_rate": 0.001, "loss": 1.8665, "step": 546300 }, { "epoch": 31.958823185354156, "grad_norm": 0.17498362064361572, "learning_rate": 0.001, "loss": 1.8694, "step": 546400 }, { "epoch": 31.96467216470726, "grad_norm": 0.1852767914533615, "learning_rate": 0.001, "loss": 1.8664, "step": 546500 }, { "epoch": 31.97052114406036, "grad_norm": 0.17469073832035065, "learning_rate": 0.001, "loss": 1.8688, "step": 546600 }, { "epoch": 31.976370123413464, "grad_norm": 0.20039184391498566, "learning_rate": 0.001, "loss": 1.8676, "step": 546700 }, { "epoch": 31.982219102766567, "grad_norm": 0.1529209017753601, "learning_rate": 0.001, "loss": 1.8623, "step": 546800 }, { "epoch": 31.98806808211967, "grad_norm": 0.19555990397930145, "learning_rate": 0.001, "loss": 1.864, "step": 546900 }, { "epoch": 31.99391706147277, "grad_norm": 0.1841612309217453, "learning_rate": 0.001, "loss": 1.862, "step": 547000 }, { "epoch": 31.999766040825875, "grad_norm": 0.2780628204345703, "learning_rate": 0.001, "loss": 1.8651, "step": 547100 }, { "epoch": 32.00561502017898, "grad_norm": 0.19156382977962494, "learning_rate": 0.001, "loss": 1.8565, "step": 547200 }, { "epoch": 32.01146399953208, "grad_norm": 0.2263544797897339, "learning_rate": 0.001, "loss": 1.8503, "step": 547300 }, { "epoch": 32.01731297888519, "grad_norm": 0.17689257860183716, "learning_rate": 0.001, "loss": 1.8539, "step": 547400 }, { "epoch": 32.02316195823829, "grad_norm": 0.2153986096382141, "learning_rate": 0.001, "loss": 1.8573, "step": 547500 }, { "epoch": 32.02901093759139, "grad_norm": 0.21715541183948517, "learning_rate": 0.001, "loss": 1.8539, "step": 547600 }, { "epoch": 32.03485991694449, "grad_norm": 0.25639790296554565, "learning_rate": 0.001, "loss": 1.8555, "step": 547700 }, { "epoch": 32.040708896297595, "grad_norm": 0.17665225267410278, "learning_rate": 0.001, "loss": 1.8497, "step": 547800 }, { "epoch": 32.0465578756507, "grad_norm": 0.2335829883813858, "learning_rate": 0.001, "loss": 1.8524, "step": 547900 }, { "epoch": 32.0524068550038, "grad_norm": 0.17898398637771606, "learning_rate": 0.001, "loss": 1.8481, "step": 548000 }, { "epoch": 32.058255834356906, "grad_norm": 0.20375144481658936, "learning_rate": 0.001, "loss": 1.8522, "step": 548100 }, { "epoch": 32.06410481371001, "grad_norm": 0.20649060606956482, "learning_rate": 0.001, "loss": 1.8499, "step": 548200 }, { "epoch": 32.069953793063114, "grad_norm": 0.19640912115573883, "learning_rate": 0.001, "loss": 1.8512, "step": 548300 }, { "epoch": 32.07580277241621, "grad_norm": 0.188322514295578, "learning_rate": 0.001, "loss": 1.8503, "step": 548400 }, { "epoch": 32.081651751769314, "grad_norm": 0.20403724908828735, "learning_rate": 0.001, "loss": 1.8507, "step": 548500 }, { "epoch": 32.08750073112242, "grad_norm": 0.2122056484222412, "learning_rate": 0.001, "loss": 1.8496, "step": 548600 }, { "epoch": 32.09334971047552, "grad_norm": 0.22892408072948456, "learning_rate": 0.001, "loss": 1.8574, "step": 548700 }, { "epoch": 32.099198689828626, "grad_norm": 0.2247961461544037, "learning_rate": 0.001, "loss": 1.8494, "step": 548800 }, { "epoch": 32.10504766918173, "grad_norm": 0.160888209939003, "learning_rate": 0.001, "loss": 1.8532, "step": 548900 }, { "epoch": 32.11089664853483, "grad_norm": 0.2148161679506302, "learning_rate": 0.001, "loss": 1.8516, "step": 549000 }, { "epoch": 32.11674562788794, "grad_norm": 0.1992383450269699, "learning_rate": 0.001, "loss": 1.8516, "step": 549100 }, { "epoch": 32.122594607241034, "grad_norm": 0.16356557607650757, "learning_rate": 0.001, "loss": 1.8544, "step": 549200 }, { "epoch": 32.12844358659414, "grad_norm": 0.24519991874694824, "learning_rate": 0.001, "loss": 1.8498, "step": 549300 }, { "epoch": 32.13429256594724, "grad_norm": 0.211613729596138, "learning_rate": 0.001, "loss": 1.86, "step": 549400 }, { "epoch": 32.140141545300345, "grad_norm": 0.20996947586536407, "learning_rate": 0.001, "loss": 1.8532, "step": 549500 }, { "epoch": 32.14599052465345, "grad_norm": 0.30188578367233276, "learning_rate": 0.001, "loss": 1.8551, "step": 549600 }, { "epoch": 32.15183950400655, "grad_norm": 0.231881782412529, "learning_rate": 0.001, "loss": 1.858, "step": 549700 }, { "epoch": 32.15768848335966, "grad_norm": 0.20534053444862366, "learning_rate": 0.001, "loss": 1.8507, "step": 549800 }, { "epoch": 32.16353746271275, "grad_norm": 0.23370462656021118, "learning_rate": 0.001, "loss": 1.8547, "step": 549900 }, { "epoch": 32.16938644206586, "grad_norm": 0.18942365050315857, "learning_rate": 0.001, "loss": 1.8566, "step": 550000 }, { "epoch": 32.16938644206586, "eval_ag_news_accuracy": 0.24121875, "eval_ag_news_bleu_score": 7.200092914038313, "eval_ag_news_bleu_score_sem": 0.4573201685896354, "eval_ag_news_emb_cos_sim": 0.7032306790351868, "eval_ag_news_emb_cos_sim_sem": 0.014194902963936329, "eval_ag_news_emb_top1_equal": 0.90625, "eval_ag_news_emb_top1_equal_sem": 0.025864720344543457, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.702338933944702, "eval_ag_news_n_ngrams_match_1": 14.3203125, "eval_ag_news_n_ngrams_match_2": 4.0859375, "eval_ag_news_n_ngrams_match_3": 1.5546875, "eval_ag_news_num_pred_words": 47.28125, "eval_ag_news_num_true_words": 45.609375, "eval_ag_news_perplexity": 14.914575166862232, "eval_ag_news_pred_num_tokens": 74.1640625, "eval_ag_news_rouge_score": 0.2927707566932784, "eval_ag_news_runtime": 38.7154, "eval_ag_news_samples_per_second": 12.915, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3305154820122072, "eval_ag_news_token_set_f1_sem": 0.010276297822081287, "eval_ag_news_token_set_precision": 0.3093437119799365, "eval_ag_news_token_set_recall": 0.3733876458714014, "eval_ag_news_true_num_tokens": 64.5234375, "step": 550000 }, { "epoch": 32.16938644206586, "eval_anthropic_toxic_prompts_accuracy": 0.10046875, "eval_anthropic_toxic_prompts_bleu_score": 41.6853066809586, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.7042053830538935, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8808405995368958, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011206453666090965, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.15625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03221922171672519, "eval_anthropic_toxic_prompts_loss": 1.2224842309951782, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.65625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.859375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.7890625, "eval_anthropic_toxic_prompts_num_pred_words": 15.9296875, "eval_anthropic_toxic_prompts_num_true_words": 15.8203125, "eval_anthropic_toxic_prompts_perplexity": 3.39561275088735, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.71875, "eval_anthropic_toxic_prompts_rouge_score": 0.6788983870754823, "eval_anthropic_toxic_prompts_runtime": 30.1123, "eval_anthropic_toxic_prompts_samples_per_second": 16.604, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.6988975991778914, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.019984708185591882, "eval_anthropic_toxic_prompts_token_set_precision": 0.701729878582743, "eval_anthropic_toxic_prompts_token_set_recall": 0.7034841266653226, "eval_anthropic_toxic_prompts_true_num_tokens": 19.4296875, "step": 550000 }, { "epoch": 32.16938644206586, "eval_arxiv_accuracy": 0.37690625, "eval_arxiv_bleu_score": 1.6664201060572639, "eval_arxiv_bleu_score_sem": 0.1413005097990354, "eval_arxiv_emb_cos_sim": 0.47422555088996887, "eval_arxiv_emb_cos_sim_sem": 0.018941428512334824, "eval_arxiv_emb_top1_equal": 0.9140625, "eval_arxiv_emb_top1_equal_sem": 0.024870097637176514, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4255199432373047, "eval_arxiv_n_ngrams_match_1": 12.796875, "eval_arxiv_n_ngrams_match_2": 2.15625, "eval_arxiv_n_ngrams_match_3": 0.5, "eval_arxiv_num_pred_words": 54.71875, "eval_arxiv_num_true_words": 85.3125, "eval_arxiv_perplexity": 30.738623036440202, "eval_arxiv_pred_num_tokens": 125.5390625, "eval_arxiv_rouge_score": 0.16459074950030728, "eval_arxiv_runtime": 31.0301, "eval_arxiv_samples_per_second": 16.113, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.16768248062045002, "eval_arxiv_token_set_f1_sem": 0.008087692699514138, "eval_arxiv_token_set_precision": 0.11267937650358449, "eval_arxiv_token_set_recall": 0.38685104461335806, "eval_arxiv_true_num_tokens": 123.453125, "step": 550000 }, { "epoch": 32.16938644206586, "eval_python_code_alpaca_accuracy": 0.13221875, "eval_python_code_alpaca_bleu_score": 26.737695313402902, "eval_python_code_alpaca_bleu_score_sem": 1.6771546335849838, "eval_python_code_alpaca_emb_cos_sim": 0.858195424079895, "eval_python_code_alpaca_emb_cos_sim_sem": 0.012700248509645462, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4720029830932617, "eval_python_code_alpaca_n_ngrams_match_1": 10.8984375, "eval_python_code_alpaca_n_ngrams_match_2": 5.875, "eval_python_code_alpaca_n_ngrams_match_3": 3.265625, "eval_python_code_alpaca_num_pred_words": 18.6953125, "eval_python_code_alpaca_num_true_words": 20.5859375, "eval_python_code_alpaca_perplexity": 4.357955315784777, "eval_python_code_alpaca_pred_num_tokens": 26.78125, "eval_python_code_alpaca_rouge_score": 0.5835452049749609, "eval_python_code_alpaca_runtime": 29.6575, "eval_python_code_alpaca_samples_per_second": 16.859, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6029762349100513, "eval_python_code_alpaca_token_set_f1_sem": 0.014957507306885736, "eval_python_code_alpaca_token_set_precision": 0.5843519625862842, "eval_python_code_alpaca_token_set_recall": 0.6323783717621274, "eval_python_code_alpaca_true_num_tokens": 26.8125, "step": 550000 }, { "epoch": 32.16938644206586, "eval_wikibio_accuracy": 0.371328125, "eval_wikibio_bleu_score": 6.0638488723251385, "eval_wikibio_bleu_score_sem": 0.5662301953108335, "eval_wikibio_emb_cos_sim": 0.5648183226585388, "eval_wikibio_emb_cos_sim_sem": 0.02314544841647148, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6621921062469482, "eval_wikibio_n_ngrams_match_1": 13.765625, "eval_wikibio_n_ngrams_match_2": 4.3671875, "eval_wikibio_n_ngrams_match_3": 1.734375, "eval_wikibio_num_pred_words": 50.9296875, "eval_wikibio_num_true_words": 50.875, "eval_wikibio_perplexity": 14.32766245762378, "eval_wikibio_pred_num_tokens": 107.8984375, "eval_wikibio_rouge_score": 0.27118068934682105, "eval_wikibio_runtime": 31.369, "eval_wikibio_samples_per_second": 15.939, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.2996918530240936, "eval_wikibio_token_set_f1_sem": 0.011297840545128624, "eval_wikibio_token_set_precision": 0.2672119161092298, "eval_wikibio_token_set_recall": 0.3926876827962009, "eval_wikibio_true_num_tokens": 98.3828125, "step": 550000 }, { "epoch": 32.16938644206586, "eval_msmarco_accuracy": 0.39, "eval_msmarco_bleu_score": 16.420409208318333, "eval_msmarco_bleu_score_sem": 1.492518475557141, "eval_msmarco_emb_cos_sim": 0.7923615574836731, "eval_msmarco_emb_cos_sim_sem": 0.015810467302799225, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7496942281723022, "eval_msmarco_n_ngrams_match_1": 27.9453125, "eval_msmarco_n_ngrams_match_2": 12.53125, "eval_msmarco_n_ngrams_match_3": 7.0625, "eval_msmarco_num_pred_words": 62.640625, "eval_msmarco_num_true_words": 61.9453125, "eval_msmarco_perplexity": 5.752843349617744, "eval_msmarco_pred_num_tokens": 87.03125, "eval_msmarco_rouge_score": 0.4264129490457861, "eval_msmarco_runtime": 25.7879, "eval_msmarco_samples_per_second": 19.389, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.45997684251051524, "eval_msmarco_token_set_f1_sem": 0.0144455323767867, "eval_msmarco_token_set_precision": 0.4248056862045569, "eval_msmarco_token_set_recall": 0.5248090002114192, "eval_msmarco_true_num_tokens": 80.9375, "step": 550000 }, { "epoch": 32.17523542141896, "grad_norm": 0.21302750706672668, "learning_rate": 0.001, "loss": 1.8571, "step": 550100 }, { "epoch": 32.181084400772065, "grad_norm": 0.18326538801193237, "learning_rate": 0.001, "loss": 1.8556, "step": 550200 }, { "epoch": 32.18693338012517, "grad_norm": 0.2072814255952835, "learning_rate": 0.001, "loss": 1.8613, "step": 550300 }, { "epoch": 32.19278235947827, "grad_norm": 0.26479572057724, "learning_rate": 0.001, "loss": 1.8525, "step": 550400 }, { "epoch": 32.198631338831376, "grad_norm": 0.19722433388233185, "learning_rate": 0.001, "loss": 1.8532, "step": 550500 }, { "epoch": 32.20448031818448, "grad_norm": 0.20402891933918, "learning_rate": 0.001, "loss": 1.8547, "step": 550600 }, { "epoch": 32.21032929753758, "grad_norm": 0.17856933176517487, "learning_rate": 0.001, "loss": 1.8579, "step": 550700 }, { "epoch": 32.21617827689068, "grad_norm": 0.20509737730026245, "learning_rate": 0.001, "loss": 1.8567, "step": 550800 }, { "epoch": 32.222027256243784, "grad_norm": 0.23192881047725677, "learning_rate": 0.001, "loss": 1.8623, "step": 550900 }, { "epoch": 32.22787623559689, "grad_norm": 0.23167283833026886, "learning_rate": 0.001, "loss": 1.8599, "step": 551000 }, { "epoch": 32.23372521494999, "grad_norm": 0.19113211333751678, "learning_rate": 0.001, "loss": 1.855, "step": 551100 }, { "epoch": 32.239574194303096, "grad_norm": 0.1947174221277237, "learning_rate": 0.001, "loss": 1.8549, "step": 551200 }, { "epoch": 32.2454231736562, "grad_norm": 0.23228996992111206, "learning_rate": 0.001, "loss": 1.8594, "step": 551300 }, { "epoch": 32.2512721530093, "grad_norm": 0.22665616869926453, "learning_rate": 0.001, "loss": 1.8595, "step": 551400 }, { "epoch": 32.2571211323624, "grad_norm": 0.18931762874126434, "learning_rate": 0.001, "loss": 1.8476, "step": 551500 }, { "epoch": 32.262970111715504, "grad_norm": 0.20679572224617004, "learning_rate": 0.001, "loss": 1.8555, "step": 551600 }, { "epoch": 32.26881909106861, "grad_norm": 0.18089433014392853, "learning_rate": 0.001, "loss": 1.8557, "step": 551700 }, { "epoch": 32.27466807042171, "grad_norm": 0.22495689988136292, "learning_rate": 0.001, "loss": 1.8561, "step": 551800 }, { "epoch": 32.280517049774815, "grad_norm": 0.180611252784729, "learning_rate": 0.001, "loss": 1.8669, "step": 551900 }, { "epoch": 32.28636602912792, "grad_norm": 0.22620703279972076, "learning_rate": 0.001, "loss": 1.8594, "step": 552000 }, { "epoch": 32.29221500848102, "grad_norm": 0.1809864044189453, "learning_rate": 0.001, "loss": 1.8605, "step": 552100 }, { "epoch": 32.29806398783413, "grad_norm": 0.16228504478931427, "learning_rate": 0.001, "loss": 1.8574, "step": 552200 }, { "epoch": 32.30391296718722, "grad_norm": 0.19257883727550507, "learning_rate": 0.001, "loss": 1.856, "step": 552300 }, { "epoch": 32.30976194654033, "grad_norm": 0.2418150156736374, "learning_rate": 0.001, "loss": 1.8558, "step": 552400 }, { "epoch": 32.31561092589343, "grad_norm": 0.24573123455047607, "learning_rate": 0.001, "loss": 1.8586, "step": 552500 }, { "epoch": 32.321459905246535, "grad_norm": 0.22681622207164764, "learning_rate": 0.001, "loss": 1.8621, "step": 552600 }, { "epoch": 32.32730888459964, "grad_norm": 0.2408379763364792, "learning_rate": 0.001, "loss": 1.8604, "step": 552700 }, { "epoch": 32.33315786395274, "grad_norm": 0.2155425250530243, "learning_rate": 0.001, "loss": 1.8641, "step": 552800 }, { "epoch": 32.339006843305846, "grad_norm": 0.21611231565475464, "learning_rate": 0.001, "loss": 1.8505, "step": 552900 }, { "epoch": 32.34485582265894, "grad_norm": 0.1510585993528366, "learning_rate": 0.001, "loss": 1.8564, "step": 553000 }, { "epoch": 32.35070480201205, "grad_norm": 0.16761159896850586, "learning_rate": 0.001, "loss": 1.856, "step": 553100 }, { "epoch": 32.35655378136515, "grad_norm": 0.22524645924568176, "learning_rate": 0.001, "loss": 1.8603, "step": 553200 }, { "epoch": 32.362402760718254, "grad_norm": 0.1969112604856491, "learning_rate": 0.001, "loss": 1.8562, "step": 553300 }, { "epoch": 32.36825174007136, "grad_norm": 0.2654365599155426, "learning_rate": 0.001, "loss": 1.856, "step": 553400 }, { "epoch": 32.37410071942446, "grad_norm": 0.20683933794498444, "learning_rate": 0.001, "loss": 1.8595, "step": 553500 }, { "epoch": 32.379949698777565, "grad_norm": 0.23152458667755127, "learning_rate": 0.001, "loss": 1.8546, "step": 553600 }, { "epoch": 32.38579867813067, "grad_norm": 0.19842185080051422, "learning_rate": 0.001, "loss": 1.8599, "step": 553700 }, { "epoch": 32.391647657483766, "grad_norm": 0.22694385051727295, "learning_rate": 0.001, "loss": 1.8577, "step": 553800 }, { "epoch": 32.39749663683687, "grad_norm": 0.23638097941875458, "learning_rate": 0.001, "loss": 1.8593, "step": 553900 }, { "epoch": 32.403345616189974, "grad_norm": 0.26267340779304504, "learning_rate": 0.001, "loss": 1.8617, "step": 554000 }, { "epoch": 32.40919459554308, "grad_norm": 0.1813780963420868, "learning_rate": 0.001, "loss": 1.8615, "step": 554100 }, { "epoch": 32.41504357489618, "grad_norm": 0.2441914826631546, "learning_rate": 0.001, "loss": 1.8553, "step": 554200 }, { "epoch": 32.420892554249285, "grad_norm": 0.17385491728782654, "learning_rate": 0.001, "loss": 1.8586, "step": 554300 }, { "epoch": 32.42674153360239, "grad_norm": 0.25119927525520325, "learning_rate": 0.001, "loss": 1.8625, "step": 554400 }, { "epoch": 32.43259051295549, "grad_norm": 0.20436273515224457, "learning_rate": 0.001, "loss": 1.8613, "step": 554500 }, { "epoch": 32.43843949230859, "grad_norm": 0.17435213923454285, "learning_rate": 0.001, "loss": 1.8638, "step": 554600 }, { "epoch": 32.44428847166169, "grad_norm": 0.16072016954421997, "learning_rate": 0.001, "loss": 1.8572, "step": 554700 }, { "epoch": 32.4501374510148, "grad_norm": 0.18757139146327972, "learning_rate": 0.001, "loss": 1.8579, "step": 554800 }, { "epoch": 32.4559864303679, "grad_norm": 0.2202368825674057, "learning_rate": 0.001, "loss": 1.8638, "step": 554900 }, { "epoch": 32.461835409721004, "grad_norm": 0.17257681488990784, "learning_rate": 0.001, "loss": 1.8548, "step": 555000 }, { "epoch": 32.46768438907411, "grad_norm": 0.18763388693332672, "learning_rate": 0.001, "loss": 1.8587, "step": 555100 }, { "epoch": 32.47353336842721, "grad_norm": 0.2143271416425705, "learning_rate": 0.001, "loss": 1.8568, "step": 555200 }, { "epoch": 32.479382347780316, "grad_norm": 0.21010828018188477, "learning_rate": 0.001, "loss": 1.8631, "step": 555300 }, { "epoch": 32.48523132713341, "grad_norm": 0.2513583302497864, "learning_rate": 0.001, "loss": 1.8635, "step": 555400 }, { "epoch": 32.491080306486516, "grad_norm": 0.23989877104759216, "learning_rate": 0.001, "loss": 1.8583, "step": 555500 }, { "epoch": 32.49692928583962, "grad_norm": 0.2168557345867157, "learning_rate": 0.001, "loss": 1.8616, "step": 555600 }, { "epoch": 32.502778265192724, "grad_norm": 0.21851174533367157, "learning_rate": 0.001, "loss": 1.866, "step": 555700 }, { "epoch": 32.50862724454583, "grad_norm": 0.15260738134384155, "learning_rate": 0.001, "loss": 1.8577, "step": 555800 }, { "epoch": 32.51447622389893, "grad_norm": 0.23182129859924316, "learning_rate": 0.001, "loss": 1.858, "step": 555900 }, { "epoch": 32.520325203252035, "grad_norm": 0.21390751004219055, "learning_rate": 0.001, "loss": 1.8567, "step": 556000 }, { "epoch": 32.52617418260513, "grad_norm": 0.2106599062681198, "learning_rate": 0.001, "loss": 1.8628, "step": 556100 }, { "epoch": 32.532023161958236, "grad_norm": 0.2106183022260666, "learning_rate": 0.001, "loss": 1.8617, "step": 556200 }, { "epoch": 32.53787214131134, "grad_norm": 0.22142533957958221, "learning_rate": 0.001, "loss": 1.863, "step": 556300 }, { "epoch": 32.54372112066444, "grad_norm": 0.18693988025188446, "learning_rate": 0.001, "loss": 1.8646, "step": 556400 }, { "epoch": 32.54957010001755, "grad_norm": 0.2072215974330902, "learning_rate": 0.001, "loss": 1.8603, "step": 556500 }, { "epoch": 32.55541907937065, "grad_norm": 0.1943190097808838, "learning_rate": 0.001, "loss": 1.8596, "step": 556600 }, { "epoch": 32.561268058723755, "grad_norm": 0.2729136347770691, "learning_rate": 0.001, "loss": 1.8667, "step": 556700 }, { "epoch": 32.56711703807686, "grad_norm": 0.17801730334758759, "learning_rate": 0.001, "loss": 1.8653, "step": 556800 }, { "epoch": 32.572966017429955, "grad_norm": 0.18259882926940918, "learning_rate": 0.001, "loss": 1.8546, "step": 556900 }, { "epoch": 32.57881499678306, "grad_norm": 0.1772671788930893, "learning_rate": 0.001, "loss": 1.8619, "step": 557000 }, { "epoch": 32.58466397613616, "grad_norm": 0.2064024955034256, "learning_rate": 0.001, "loss": 1.8628, "step": 557100 }, { "epoch": 32.59051295548927, "grad_norm": 0.2206934541463852, "learning_rate": 0.001, "loss": 1.861, "step": 557200 }, { "epoch": 32.59636193484237, "grad_norm": 0.18969400227069855, "learning_rate": 0.001, "loss": 1.8628, "step": 557300 }, { "epoch": 32.602210914195474, "grad_norm": 0.25348228216171265, "learning_rate": 0.001, "loss": 1.8641, "step": 557400 }, { "epoch": 32.60805989354858, "grad_norm": 0.20977698266506195, "learning_rate": 0.001, "loss": 1.8675, "step": 557500 }, { "epoch": 32.61390887290168, "grad_norm": 0.1928619146347046, "learning_rate": 0.001, "loss": 1.8583, "step": 557600 }, { "epoch": 32.61975785225478, "grad_norm": 0.1675405651330948, "learning_rate": 0.001, "loss": 1.8557, "step": 557700 }, { "epoch": 32.62560683160788, "grad_norm": 0.2386571168899536, "learning_rate": 0.001, "loss": 1.8632, "step": 557800 }, { "epoch": 32.631455810960986, "grad_norm": 0.25078216195106506, "learning_rate": 0.001, "loss": 1.8709, "step": 557900 }, { "epoch": 32.63730479031409, "grad_norm": 0.181132510304451, "learning_rate": 0.001, "loss": 1.8613, "step": 558000 }, { "epoch": 32.643153769667194, "grad_norm": 0.22592197358608246, "learning_rate": 0.001, "loss": 1.8642, "step": 558100 }, { "epoch": 32.6490027490203, "grad_norm": 0.2048276662826538, "learning_rate": 0.001, "loss": 1.8625, "step": 558200 }, { "epoch": 32.6548517283734, "grad_norm": 0.20066767930984497, "learning_rate": 0.001, "loss": 1.8626, "step": 558300 }, { "epoch": 32.660700707726505, "grad_norm": 0.20560257136821747, "learning_rate": 0.001, "loss": 1.8651, "step": 558400 }, { "epoch": 32.6665496870796, "grad_norm": 0.2561293840408325, "learning_rate": 0.001, "loss": 1.8687, "step": 558500 }, { "epoch": 32.672398666432706, "grad_norm": 0.15508820116519928, "learning_rate": 0.001, "loss": 1.8647, "step": 558600 }, { "epoch": 32.67824764578581, "grad_norm": 0.20123755931854248, "learning_rate": 0.001, "loss": 1.858, "step": 558700 }, { "epoch": 32.68409662513891, "grad_norm": 0.2668260633945465, "learning_rate": 0.001, "loss": 1.8643, "step": 558800 }, { "epoch": 32.68994560449202, "grad_norm": 0.18110507726669312, "learning_rate": 0.001, "loss": 1.8632, "step": 558900 }, { "epoch": 32.69579458384512, "grad_norm": 0.21572931110858917, "learning_rate": 0.001, "loss": 1.861, "step": 559000 }, { "epoch": 32.701643563198225, "grad_norm": 0.21669824421405792, "learning_rate": 0.001, "loss": 1.8641, "step": 559100 }, { "epoch": 32.70749254255132, "grad_norm": 0.1800433099269867, "learning_rate": 0.001, "loss": 1.8668, "step": 559200 }, { "epoch": 32.713341521904425, "grad_norm": 0.17459167540073395, "learning_rate": 0.001, "loss": 1.86, "step": 559300 }, { "epoch": 32.71919050125753, "grad_norm": 0.18337757885456085, "learning_rate": 0.001, "loss": 1.8543, "step": 559400 }, { "epoch": 32.72503948061063, "grad_norm": 0.2439100295305252, "learning_rate": 0.001, "loss": 1.8643, "step": 559500 }, { "epoch": 32.73088845996374, "grad_norm": 0.17568060755729675, "learning_rate": 0.001, "loss": 1.8636, "step": 559600 }, { "epoch": 32.73673743931684, "grad_norm": 0.22935718297958374, "learning_rate": 0.001, "loss": 1.8632, "step": 559700 }, { "epoch": 32.742586418669944, "grad_norm": 0.2970947325229645, "learning_rate": 0.001, "loss": 1.8612, "step": 559800 }, { "epoch": 32.74843539802305, "grad_norm": 0.19933156669139862, "learning_rate": 0.001, "loss": 1.8605, "step": 559900 }, { "epoch": 32.754284377376145, "grad_norm": 0.21639852225780487, "learning_rate": 0.001, "loss": 1.8676, "step": 560000 }, { "epoch": 32.76013335672925, "grad_norm": 0.1970139741897583, "learning_rate": 0.001, "loss": 1.8655, "step": 560100 }, { "epoch": 32.76598233608235, "grad_norm": 0.21714523434638977, "learning_rate": 0.001, "loss": 1.8653, "step": 560200 }, { "epoch": 32.771831315435456, "grad_norm": 0.19853709638118744, "learning_rate": 0.001, "loss": 1.8614, "step": 560300 }, { "epoch": 32.77768029478856, "grad_norm": 0.22486434876918793, "learning_rate": 0.001, "loss": 1.8688, "step": 560400 }, { "epoch": 32.783529274141664, "grad_norm": 0.20894639194011688, "learning_rate": 0.001, "loss": 1.868, "step": 560500 }, { "epoch": 32.78937825349477, "grad_norm": 0.16535750031471252, "learning_rate": 0.001, "loss": 1.8602, "step": 560600 }, { "epoch": 32.79522723284787, "grad_norm": 0.22626405954360962, "learning_rate": 0.001, "loss": 1.8604, "step": 560700 }, { "epoch": 32.80107621220097, "grad_norm": 0.15302181243896484, "learning_rate": 0.001, "loss": 1.8608, "step": 560800 }, { "epoch": 32.80692519155407, "grad_norm": 0.2484154999256134, "learning_rate": 0.001, "loss": 1.8596, "step": 560900 }, { "epoch": 32.812774170907176, "grad_norm": 0.17403532564640045, "learning_rate": 0.001, "loss": 1.8639, "step": 561000 }, { "epoch": 32.81862315026028, "grad_norm": 0.24460521340370178, "learning_rate": 0.001, "loss": 1.8565, "step": 561100 }, { "epoch": 32.82447212961338, "grad_norm": 0.18789620697498322, "learning_rate": 0.001, "loss": 1.8564, "step": 561200 }, { "epoch": 32.83032110896649, "grad_norm": 0.28790417313575745, "learning_rate": 0.001, "loss": 1.8606, "step": 561300 }, { "epoch": 32.83617008831959, "grad_norm": 0.20237350463867188, "learning_rate": 0.001, "loss": 1.8615, "step": 561400 }, { "epoch": 32.842019067672695, "grad_norm": 0.21003146469593048, "learning_rate": 0.001, "loss": 1.8599, "step": 561500 }, { "epoch": 32.84786804702579, "grad_norm": 0.18598824739456177, "learning_rate": 0.001, "loss": 1.8613, "step": 561600 }, { "epoch": 32.853717026378895, "grad_norm": 0.22012315690517426, "learning_rate": 0.001, "loss": 1.8646, "step": 561700 }, { "epoch": 32.859566005732, "grad_norm": 0.19905029237270355, "learning_rate": 0.001, "loss": 1.8685, "step": 561800 }, { "epoch": 32.8654149850851, "grad_norm": 0.18540455400943756, "learning_rate": 0.001, "loss": 1.8642, "step": 561900 }, { "epoch": 32.87126396443821, "grad_norm": 0.1863251030445099, "learning_rate": 0.001, "loss": 1.8698, "step": 562000 }, { "epoch": 32.87711294379131, "grad_norm": 0.19848251342773438, "learning_rate": 0.001, "loss": 1.8588, "step": 562100 }, { "epoch": 32.882961923144414, "grad_norm": 0.24628886580467224, "learning_rate": 0.001, "loss": 1.8649, "step": 562200 }, { "epoch": 32.88881090249751, "grad_norm": 0.17930282652378082, "learning_rate": 0.001, "loss": 1.8632, "step": 562300 }, { "epoch": 32.894659881850615, "grad_norm": 0.20482727885246277, "learning_rate": 0.001, "loss": 1.8651, "step": 562400 }, { "epoch": 32.90050886120372, "grad_norm": 0.21987536549568176, "learning_rate": 0.001, "loss": 1.8628, "step": 562500 }, { "epoch": 32.90635784055682, "grad_norm": 0.22500520944595337, "learning_rate": 0.001, "loss": 1.8673, "step": 562600 }, { "epoch": 32.912206819909926, "grad_norm": 0.22388598322868347, "learning_rate": 0.001, "loss": 1.8619, "step": 562700 }, { "epoch": 32.91805579926303, "grad_norm": 0.2280515432357788, "learning_rate": 0.001, "loss": 1.8633, "step": 562800 }, { "epoch": 32.923904778616134, "grad_norm": 0.185227632522583, "learning_rate": 0.001, "loss": 1.8552, "step": 562900 }, { "epoch": 32.92975375796924, "grad_norm": 0.19820882380008698, "learning_rate": 0.001, "loss": 1.8619, "step": 563000 }, { "epoch": 32.935602737322334, "grad_norm": 0.23234154284000397, "learning_rate": 0.001, "loss": 1.8648, "step": 563100 }, { "epoch": 32.94145171667544, "grad_norm": 0.19733047485351562, "learning_rate": 0.001, "loss": 1.8596, "step": 563200 }, { "epoch": 32.94730069602854, "grad_norm": 0.19371478259563446, "learning_rate": 0.001, "loss": 1.8672, "step": 563300 }, { "epoch": 32.953149675381646, "grad_norm": 0.22758375108242035, "learning_rate": 0.001, "loss": 1.8661, "step": 563400 }, { "epoch": 32.95899865473475, "grad_norm": 0.19380861520767212, "learning_rate": 0.001, "loss": 1.8649, "step": 563500 }, { "epoch": 32.96484763408785, "grad_norm": 0.19214923679828644, "learning_rate": 0.001, "loss": 1.8599, "step": 563600 }, { "epoch": 32.97069661344096, "grad_norm": 0.18628250062465668, "learning_rate": 0.001, "loss": 1.8598, "step": 563700 }, { "epoch": 32.97654559279406, "grad_norm": 0.18193577229976654, "learning_rate": 0.001, "loss": 1.8585, "step": 563800 }, { "epoch": 32.98239457214716, "grad_norm": 0.21360023319721222, "learning_rate": 0.001, "loss": 1.8724, "step": 563900 }, { "epoch": 32.98824355150026, "grad_norm": 0.18951548635959625, "learning_rate": 0.001, "loss": 1.8616, "step": 564000 }, { "epoch": 32.994092530853365, "grad_norm": 0.21239525079727173, "learning_rate": 0.001, "loss": 1.8646, "step": 564100 }, { "epoch": 32.99994151020647, "grad_norm": 0.21810096502304077, "learning_rate": 0.001, "loss": 1.866, "step": 564200 }, { "epoch": 33.00579048955957, "grad_norm": 0.23236297070980072, "learning_rate": 0.001, "loss": 1.8538, "step": 564300 }, { "epoch": 33.011639468912676, "grad_norm": 0.23823557794094086, "learning_rate": 0.001, "loss": 1.8475, "step": 564400 }, { "epoch": 33.01748844826578, "grad_norm": 0.18243907392024994, "learning_rate": 0.001, "loss": 1.848, "step": 564500 }, { "epoch": 33.023337427618884, "grad_norm": 0.1971290558576584, "learning_rate": 0.001, "loss": 1.8438, "step": 564600 }, { "epoch": 33.02918640697198, "grad_norm": 0.18155431747436523, "learning_rate": 0.001, "loss": 1.8468, "step": 564700 }, { "epoch": 33.035035386325085, "grad_norm": 0.22198018431663513, "learning_rate": 0.001, "loss": 1.8567, "step": 564800 }, { "epoch": 33.04088436567819, "grad_norm": 0.24423418939113617, "learning_rate": 0.001, "loss": 1.8527, "step": 564900 }, { "epoch": 33.04673334503129, "grad_norm": 0.21251866221427917, "learning_rate": 0.001, "loss": 1.8487, "step": 565000 }, { "epoch": 33.052582324384396, "grad_norm": 0.2410731315612793, "learning_rate": 0.001, "loss": 1.8517, "step": 565100 }, { "epoch": 33.0584313037375, "grad_norm": 0.1946639120578766, "learning_rate": 0.001, "loss": 1.8472, "step": 565200 }, { "epoch": 33.0642802830906, "grad_norm": 0.2161322981119156, "learning_rate": 0.001, "loss": 1.8505, "step": 565300 }, { "epoch": 33.0701292624437, "grad_norm": 0.24778103828430176, "learning_rate": 0.001, "loss": 1.8469, "step": 565400 }, { "epoch": 33.075978241796804, "grad_norm": 0.2782914340496063, "learning_rate": 0.001, "loss": 1.8503, "step": 565500 }, { "epoch": 33.08182722114991, "grad_norm": 0.3060348331928253, "learning_rate": 0.001, "loss": 1.8537, "step": 565600 }, { "epoch": 33.08767620050301, "grad_norm": 0.19986672699451447, "learning_rate": 0.001, "loss": 1.8475, "step": 565700 }, { "epoch": 33.093525179856115, "grad_norm": 0.20963360369205475, "learning_rate": 0.001, "loss": 1.8469, "step": 565800 }, { "epoch": 33.09937415920922, "grad_norm": 0.24762728810310364, "learning_rate": 0.001, "loss": 1.8471, "step": 565900 }, { "epoch": 33.10522313856232, "grad_norm": 0.2006673514842987, "learning_rate": 0.001, "loss": 1.8523, "step": 566000 }, { "epoch": 33.11107211791543, "grad_norm": 0.25853925943374634, "learning_rate": 0.001, "loss": 1.8533, "step": 566100 }, { "epoch": 33.11692109726852, "grad_norm": 0.2405375987291336, "learning_rate": 0.001, "loss": 1.8579, "step": 566200 }, { "epoch": 33.12277007662163, "grad_norm": 0.21075387299060822, "learning_rate": 0.001, "loss": 1.8549, "step": 566300 }, { "epoch": 33.12861905597473, "grad_norm": 0.20698092877864838, "learning_rate": 0.001, "loss": 1.8509, "step": 566400 }, { "epoch": 33.134468035327835, "grad_norm": 0.21399502456188202, "learning_rate": 0.001, "loss": 1.8489, "step": 566500 }, { "epoch": 33.14031701468094, "grad_norm": 0.2004701942205429, "learning_rate": 0.001, "loss": 1.8507, "step": 566600 }, { "epoch": 33.14616599403404, "grad_norm": 0.2576623558998108, "learning_rate": 0.001, "loss": 1.8537, "step": 566700 }, { "epoch": 33.152014973387146, "grad_norm": 0.23459239304065704, "learning_rate": 0.001, "loss": 1.8527, "step": 566800 }, { "epoch": 33.15786395274025, "grad_norm": 0.19826653599739075, "learning_rate": 0.001, "loss": 1.8521, "step": 566900 }, { "epoch": 33.16371293209335, "grad_norm": 0.30504217743873596, "learning_rate": 0.001, "loss": 1.8545, "step": 567000 }, { "epoch": 33.16956191144645, "grad_norm": 0.30830028653144836, "learning_rate": 0.001, "loss": 1.8508, "step": 567100 }, { "epoch": 33.175410890799554, "grad_norm": 0.20658093690872192, "learning_rate": 0.001, "loss": 1.8569, "step": 567200 }, { "epoch": 33.18125987015266, "grad_norm": 0.18622596561908722, "learning_rate": 0.001, "loss": 1.8549, "step": 567300 }, { "epoch": 33.18710884950576, "grad_norm": 0.18327903747558594, "learning_rate": 0.001, "loss": 1.8551, "step": 567400 }, { "epoch": 33.192957828858866, "grad_norm": 0.21330595016479492, "learning_rate": 0.001, "loss": 1.8434, "step": 567500 }, { "epoch": 33.19880680821197, "grad_norm": 0.19640237092971802, "learning_rate": 0.001, "loss": 1.8472, "step": 567600 }, { "epoch": 33.20465578756507, "grad_norm": 0.21603210270404816, "learning_rate": 0.001, "loss": 1.8446, "step": 567700 }, { "epoch": 33.21050476691817, "grad_norm": 0.26013633608818054, "learning_rate": 0.001, "loss": 1.8514, "step": 567800 }, { "epoch": 33.216353746271274, "grad_norm": 0.2554014325141907, "learning_rate": 0.001, "loss": 1.8525, "step": 567900 }, { "epoch": 33.22220272562438, "grad_norm": 0.3101259768009186, "learning_rate": 0.001, "loss": 1.855, "step": 568000 }, { "epoch": 33.22805170497748, "grad_norm": 0.19756819307804108, "learning_rate": 0.001, "loss": 1.8572, "step": 568100 }, { "epoch": 33.233900684330585, "grad_norm": 0.2213662564754486, "learning_rate": 0.001, "loss": 1.8546, "step": 568200 }, { "epoch": 33.23974966368369, "grad_norm": 0.19769784808158875, "learning_rate": 0.001, "loss": 1.8487, "step": 568300 }, { "epoch": 33.24559864303679, "grad_norm": 0.23660001158714294, "learning_rate": 0.001, "loss": 1.854, "step": 568400 }, { "epoch": 33.25144762238989, "grad_norm": 0.24117259681224823, "learning_rate": 0.001, "loss": 1.8487, "step": 568500 }, { "epoch": 33.25729660174299, "grad_norm": 0.2838766574859619, "learning_rate": 0.001, "loss": 1.8507, "step": 568600 }, { "epoch": 33.2631455810961, "grad_norm": 0.25821757316589355, "learning_rate": 0.001, "loss": 1.8597, "step": 568700 }, { "epoch": 33.2689945604492, "grad_norm": 0.21180541813373566, "learning_rate": 0.001, "loss": 1.8493, "step": 568800 }, { "epoch": 33.274843539802305, "grad_norm": 0.2237756997346878, "learning_rate": 0.001, "loss": 1.8638, "step": 568900 }, { "epoch": 33.28069251915541, "grad_norm": 0.2186642587184906, "learning_rate": 0.001, "loss": 1.8551, "step": 569000 }, { "epoch": 33.28654149850851, "grad_norm": 0.21230249106884003, "learning_rate": 0.001, "loss": 1.8551, "step": 569100 }, { "epoch": 33.292390477861616, "grad_norm": 0.22339816391468048, "learning_rate": 0.001, "loss": 1.8554, "step": 569200 }, { "epoch": 33.29823945721471, "grad_norm": 0.25609317421913147, "learning_rate": 0.001, "loss": 1.8524, "step": 569300 }, { "epoch": 33.30408843656782, "grad_norm": 0.20811963081359863, "learning_rate": 0.001, "loss": 1.8519, "step": 569400 }, { "epoch": 33.30993741592092, "grad_norm": 0.20572112500667572, "learning_rate": 0.001, "loss": 1.8525, "step": 569500 }, { "epoch": 33.315786395274024, "grad_norm": 0.18352796137332916, "learning_rate": 0.001, "loss": 1.8494, "step": 569600 }, { "epoch": 33.32163537462713, "grad_norm": 0.20105098187923431, "learning_rate": 0.001, "loss": 1.8613, "step": 569700 }, { "epoch": 33.32748435398023, "grad_norm": 0.2226068526506424, "learning_rate": 0.001, "loss": 1.8565, "step": 569800 }, { "epoch": 33.333333333333336, "grad_norm": 0.23279763758182526, "learning_rate": 0.001, "loss": 1.8573, "step": 569900 }, { "epoch": 33.33918231268644, "grad_norm": 0.25508707761764526, "learning_rate": 0.001, "loss": 1.8543, "step": 570000 }, { "epoch": 33.345031292039536, "grad_norm": 0.2556733191013336, "learning_rate": 0.001, "loss": 1.858, "step": 570100 }, { "epoch": 33.35088027139264, "grad_norm": 0.22541344165802002, "learning_rate": 0.001, "loss": 1.8531, "step": 570200 }, { "epoch": 33.356729250745744, "grad_norm": 0.2669758200645447, "learning_rate": 0.001, "loss": 1.8598, "step": 570300 }, { "epoch": 33.36257823009885, "grad_norm": 0.270149827003479, "learning_rate": 0.001, "loss": 1.8503, "step": 570400 }, { "epoch": 33.36842720945195, "grad_norm": 0.22461917996406555, "learning_rate": 0.001, "loss": 1.8569, "step": 570500 }, { "epoch": 33.374276188805055, "grad_norm": 0.20913472771644592, "learning_rate": 0.001, "loss": 1.8558, "step": 570600 }, { "epoch": 33.38012516815816, "grad_norm": 0.17277874052524567, "learning_rate": 0.001, "loss": 1.8517, "step": 570700 }, { "epoch": 33.38597414751126, "grad_norm": 0.2169635146856308, "learning_rate": 0.001, "loss": 1.8557, "step": 570800 }, { "epoch": 33.39182312686436, "grad_norm": 0.21152202785015106, "learning_rate": 0.001, "loss": 1.8618, "step": 570900 }, { "epoch": 33.39767210621746, "grad_norm": 0.24137909710407257, "learning_rate": 0.001, "loss": 1.861, "step": 571000 }, { "epoch": 33.40352108557057, "grad_norm": 0.20119978487491608, "learning_rate": 0.001, "loss": 1.8581, "step": 571100 }, { "epoch": 33.40937006492367, "grad_norm": 0.24542225897312164, "learning_rate": 0.001, "loss": 1.8569, "step": 571200 }, { "epoch": 33.415219044276775, "grad_norm": 0.16214565932750702, "learning_rate": 0.001, "loss": 1.8586, "step": 571300 }, { "epoch": 33.42106802362988, "grad_norm": 0.18829260766506195, "learning_rate": 0.001, "loss": 1.8565, "step": 571400 }, { "epoch": 33.42691700298298, "grad_norm": 0.2381541132926941, "learning_rate": 0.001, "loss": 1.8513, "step": 571500 }, { "epoch": 33.43276598233608, "grad_norm": 0.245167538523674, "learning_rate": 0.001, "loss": 1.8521, "step": 571600 }, { "epoch": 33.43861496168918, "grad_norm": 0.2150796800851822, "learning_rate": 0.001, "loss": 1.8573, "step": 571700 }, { "epoch": 33.44446394104229, "grad_norm": 0.21646015346050262, "learning_rate": 0.001, "loss": 1.8604, "step": 571800 }, { "epoch": 33.45031292039539, "grad_norm": 0.28593710064888, "learning_rate": 0.001, "loss": 1.8524, "step": 571900 }, { "epoch": 33.456161899748494, "grad_norm": 0.2240530550479889, "learning_rate": 0.001, "loss": 1.8552, "step": 572000 }, { "epoch": 33.4620108791016, "grad_norm": 0.22253789007663727, "learning_rate": 0.001, "loss": 1.8496, "step": 572100 }, { "epoch": 33.4678598584547, "grad_norm": 0.23691552877426147, "learning_rate": 0.001, "loss": 1.8557, "step": 572200 }, { "epoch": 33.473708837807806, "grad_norm": 0.225687175989151, "learning_rate": 0.001, "loss": 1.853, "step": 572300 }, { "epoch": 33.4795578171609, "grad_norm": 0.2139521688222885, "learning_rate": 0.001, "loss": 1.8517, "step": 572400 }, { "epoch": 33.485406796514006, "grad_norm": 0.18671905994415283, "learning_rate": 0.001, "loss": 1.8501, "step": 572500 }, { "epoch": 33.49125577586711, "grad_norm": 0.19661261141300201, "learning_rate": 0.001, "loss": 1.8571, "step": 572600 }, { "epoch": 33.497104755220214, "grad_norm": 0.24217726290225983, "learning_rate": 0.001, "loss": 1.8594, "step": 572700 }, { "epoch": 33.50295373457332, "grad_norm": 0.2057289332151413, "learning_rate": 0.001, "loss": 1.8523, "step": 572800 }, { "epoch": 33.50880271392642, "grad_norm": 0.2704857885837555, "learning_rate": 0.001, "loss": 1.8507, "step": 572900 }, { "epoch": 33.514651693279525, "grad_norm": 0.2509515583515167, "learning_rate": 0.001, "loss": 1.8616, "step": 573000 }, { "epoch": 33.52050067263263, "grad_norm": 0.2356201559305191, "learning_rate": 0.001, "loss": 1.8528, "step": 573100 }, { "epoch": 33.526349651985726, "grad_norm": 0.24122226238250732, "learning_rate": 0.001, "loss": 1.8566, "step": 573200 }, { "epoch": 33.53219863133883, "grad_norm": 0.4554457664489746, "learning_rate": 0.001, "loss": 1.8546, "step": 573300 }, { "epoch": 33.53804761069193, "grad_norm": 0.4220179319381714, "learning_rate": 0.001, "loss": 1.8576, "step": 573400 }, { "epoch": 33.54389659004504, "grad_norm": 0.19933272898197174, "learning_rate": 0.001, "loss": 1.8577, "step": 573500 }, { "epoch": 33.54974556939814, "grad_norm": 0.23180894553661346, "learning_rate": 0.001, "loss": 1.858, "step": 573600 }, { "epoch": 33.555594548751245, "grad_norm": 0.21973302960395813, "learning_rate": 0.001, "loss": 1.865, "step": 573700 }, { "epoch": 33.56144352810435, "grad_norm": 0.22362208366394043, "learning_rate": 0.001, "loss": 1.8548, "step": 573800 }, { "epoch": 33.56729250745745, "grad_norm": 0.25676044821739197, "learning_rate": 0.001, "loss": 1.8543, "step": 573900 }, { "epoch": 33.57314148681055, "grad_norm": 0.20124579966068268, "learning_rate": 0.001, "loss": 1.8572, "step": 574000 }, { "epoch": 33.57899046616365, "grad_norm": 0.2330150455236435, "learning_rate": 0.001, "loss": 1.858, "step": 574100 }, { "epoch": 33.58483944551676, "grad_norm": 0.19916164875030518, "learning_rate": 0.001, "loss": 1.8542, "step": 574200 }, { "epoch": 33.59068842486986, "grad_norm": 0.18642397224903107, "learning_rate": 0.001, "loss": 1.8568, "step": 574300 }, { "epoch": 33.596537404222964, "grad_norm": 0.25373464822769165, "learning_rate": 0.001, "loss": 1.859, "step": 574400 }, { "epoch": 33.60238638357607, "grad_norm": 0.2207375168800354, "learning_rate": 0.001, "loss": 1.8538, "step": 574500 }, { "epoch": 33.60823536292917, "grad_norm": 0.2049117088317871, "learning_rate": 0.001, "loss": 1.8533, "step": 574600 }, { "epoch": 33.61408434228227, "grad_norm": 0.2610439658164978, "learning_rate": 0.001, "loss": 1.8609, "step": 574700 }, { "epoch": 33.61993332163537, "grad_norm": 0.18492580950260162, "learning_rate": 0.001, "loss": 1.8582, "step": 574800 }, { "epoch": 33.625782300988476, "grad_norm": 0.217959463596344, "learning_rate": 0.001, "loss": 1.8571, "step": 574900 }, { "epoch": 33.63163128034158, "grad_norm": 0.24737493693828583, "learning_rate": 0.001, "loss": 1.8603, "step": 575000 }, { "epoch": 33.63163128034158, "eval_ag_news_accuracy": 0.23728125, "eval_ag_news_bleu_score": 6.853849834474335, "eval_ag_news_bleu_score_sem": 0.4454054228488862, "eval_ag_news_emb_cos_sim": 0.7220637798309326, "eval_ag_news_emb_cos_sim_sem": 0.013994506560266018, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7324864864349365, "eval_ag_news_n_ngrams_match_1": 13.8359375, "eval_ag_news_n_ngrams_match_2": 4.015625, "eval_ag_news_n_ngrams_match_3": 1.5546875, "eval_ag_news_num_pred_words": 45.046875, "eval_ag_news_num_true_words": 42.9453125, "eval_ag_news_perplexity": 15.371059473467463, "eval_ag_news_pred_num_tokens": 67.96875, "eval_ag_news_rouge_score": 0.3063882964745428, "eval_ag_news_runtime": 39.2243, "eval_ag_news_samples_per_second": 12.747, "eval_ag_news_steps_per_second": 0.025, "eval_ag_news_token_set_f1": 0.33801677848431955, "eval_ag_news_token_set_f1_sem": 0.009972978239318234, "eval_ag_news_token_set_precision": 0.32385325816517435, "eval_ag_news_token_set_recall": 0.3601064838341586, "eval_ag_news_true_num_tokens": 59.796875, "step": 575000 }, { "epoch": 33.63163128034158, "eval_anthropic_toxic_prompts_accuracy": 0.102109375, "eval_anthropic_toxic_prompts_bleu_score": 44.02423667415165, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.891240639199947, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8953338861465454, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009354286827147007, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1953125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03517845700641747, "eval_anthropic_toxic_prompts_loss": 1.2422962188720703, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.96875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.0625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.984375, "eval_anthropic_toxic_prompts_num_pred_words": 16.6953125, "eval_anthropic_toxic_prompts_num_true_words": 16.484375, "eval_anthropic_toxic_prompts_perplexity": 3.4635574266622022, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.390625, "eval_anthropic_toxic_prompts_rouge_score": 0.6972035122211939, "eval_anthropic_toxic_prompts_runtime": 31.882, "eval_anthropic_toxic_prompts_samples_per_second": 15.683, "eval_anthropic_toxic_prompts_steps_per_second": 0.031, "eval_anthropic_toxic_prompts_token_set_f1": 0.7129592555477665, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01963574234302721, "eval_anthropic_toxic_prompts_token_set_precision": 0.7106408656857803, "eval_anthropic_toxic_prompts_token_set_recall": 0.7208756949659457, "eval_anthropic_toxic_prompts_true_num_tokens": 20.1640625, "step": 575000 }, { "epoch": 33.63163128034158, "eval_arxiv_accuracy": 0.37390625, "eval_arxiv_bleu_score": 1.778608396112267, "eval_arxiv_bleu_score_sem": 0.15595316188615846, "eval_arxiv_emb_cos_sim": 0.4547765552997589, "eval_arxiv_emb_cos_sim_sem": 0.01928488165140152, "eval_arxiv_emb_top1_equal": 0.8984375, "eval_arxiv_emb_top1_equal_sem": 0.026804566383361816, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.452687978744507, "eval_arxiv_n_ngrams_match_1": 13.8125, "eval_arxiv_n_ngrams_match_2": 2.4921875, "eval_arxiv_n_ngrams_match_3": 0.5703125, "eval_arxiv_num_pred_words": 55.1640625, "eval_arxiv_num_true_words": 87.2421875, "eval_arxiv_perplexity": 31.585178594534458, "eval_arxiv_pred_num_tokens": 125.7734375, "eval_arxiv_rouge_score": 0.17961660074567984, "eval_arxiv_runtime": 31.626, "eval_arxiv_samples_per_second": 15.81, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.18114002198382415, "eval_arxiv_token_set_f1_sem": 0.009120418266943332, "eval_arxiv_token_set_precision": 0.1244302491174452, "eval_arxiv_token_set_recall": 0.41204228266827597, "eval_arxiv_true_num_tokens": 125.375, "step": 575000 }, { "epoch": 33.63163128034158, "eval_python_code_alpaca_accuracy": 0.131140625, "eval_python_code_alpaca_bleu_score": 27.913260432060838, "eval_python_code_alpaca_bleu_score_sem": 1.683955236435946, "eval_python_code_alpaca_emb_cos_sim": 0.8697555065155029, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009294097311794758, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4496937990188599, "eval_python_code_alpaca_n_ngrams_match_1": 10.265625, "eval_python_code_alpaca_n_ngrams_match_2": 5.7109375, "eval_python_code_alpaca_n_ngrams_match_3": 3.1953125, "eval_python_code_alpaca_num_pred_words": 18.015625, "eval_python_code_alpaca_num_true_words": 18.421875, "eval_python_code_alpaca_perplexity": 4.261809345153927, "eval_python_code_alpaca_pred_num_tokens": 24.09375, "eval_python_code_alpaca_rouge_score": 0.590375249995494, "eval_python_code_alpaca_runtime": 29.9266, "eval_python_code_alpaca_samples_per_second": 16.708, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.607452588873998, "eval_python_code_alpaca_token_set_f1_sem": 0.014181879497003117, "eval_python_code_alpaca_token_set_precision": 0.598424849242641, "eval_python_code_alpaca_token_set_recall": 0.6221547967982944, "eval_python_code_alpaca_true_num_tokens": 24.234375, "step": 575000 }, { "epoch": 33.63163128034158, "eval_wikibio_accuracy": 0.364109375, "eval_wikibio_bleu_score": 7.086220734681513, "eval_wikibio_bleu_score_sem": 0.6671146513862606, "eval_wikibio_emb_cos_sim": 0.5877400636672974, "eval_wikibio_emb_cos_sim_sem": 0.02306285873055458, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.697312116622925, "eval_wikibio_n_ngrams_match_1": 15.578125, "eval_wikibio_n_ngrams_match_2": 5.140625, "eval_wikibio_n_ngrams_match_3": 2.0, "eval_wikibio_num_pred_words": 53.46875, "eval_wikibio_num_true_words": 53.171875, "eval_wikibio_perplexity": 14.83979044411355, "eval_wikibio_pred_num_tokens": 109.8515625, "eval_wikibio_rouge_score": 0.29547111135426674, "eval_wikibio_runtime": 31.3252, "eval_wikibio_samples_per_second": 15.962, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.319669258600013, "eval_wikibio_token_set_f1_sem": 0.01174260380951484, "eval_wikibio_token_set_precision": 0.2859588259258, "eval_wikibio_token_set_recall": 0.39982062363401794, "eval_wikibio_true_num_tokens": 101.84375, "step": 575000 }, { "epoch": 33.63163128034158, "eval_msmarco_accuracy": 0.399140625, "eval_msmarco_bleu_score": 17.622423286517208, "eval_msmarco_bleu_score_sem": 1.423555779173352, "eval_msmarco_emb_cos_sim": 0.7617058753967285, "eval_msmarco_emb_cos_sim_sem": 0.01778782159090042, "eval_msmarco_emb_top1_equal": 0.9140625, "eval_msmarco_emb_top1_equal_sem": 0.024870097637176514, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7442445755004883, "eval_msmarco_n_ngrams_match_1": 28.359375, "eval_msmarco_n_ngrams_match_2": 13.1484375, "eval_msmarco_n_ngrams_match_3": 7.671875, "eval_msmarco_num_pred_words": 61.65625, "eval_msmarco_num_true_words": 61.7421875, "eval_msmarco_perplexity": 5.721577622542804, "eval_msmarco_pred_num_tokens": 84.328125, "eval_msmarco_rouge_score": 0.44608588809398264, "eval_msmarco_runtime": 26.0269, "eval_msmarco_samples_per_second": 19.211, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.47068818766819576, "eval_msmarco_token_set_f1_sem": 0.013889156141803877, "eval_msmarco_token_set_precision": 0.4367243710238831, "eval_msmarco_token_set_recall": 0.5326388670970792, "eval_msmarco_true_num_tokens": 79.671875, "step": 575000 }, { "epoch": 33.637480259694684, "grad_norm": 0.2649606466293335, "learning_rate": 0.001, "loss": 1.8601, "step": 575100 }, { "epoch": 33.64332923904779, "grad_norm": 0.23689435422420502, "learning_rate": 0.001, "loss": 1.8552, "step": 575200 }, { "epoch": 33.64917821840089, "grad_norm": 0.2855735123157501, "learning_rate": 0.001, "loss": 1.8543, "step": 575300 }, { "epoch": 33.655027197753995, "grad_norm": 0.2896367013454437, "learning_rate": 0.001, "loss": 1.8605, "step": 575400 }, { "epoch": 33.66087617710709, "grad_norm": 0.1727568507194519, "learning_rate": 0.001, "loss": 1.8526, "step": 575500 }, { "epoch": 33.666725156460195, "grad_norm": 0.2508307993412018, "learning_rate": 0.001, "loss": 1.8557, "step": 575600 }, { "epoch": 33.6725741358133, "grad_norm": 0.19307416677474976, "learning_rate": 0.001, "loss": 1.863, "step": 575700 }, { "epoch": 33.6784231151664, "grad_norm": 0.2710000276565552, "learning_rate": 0.001, "loss": 1.8553, "step": 575800 }, { "epoch": 33.68427209451951, "grad_norm": 0.2305716574192047, "learning_rate": 0.001, "loss": 1.8595, "step": 575900 }, { "epoch": 33.69012107387261, "grad_norm": 0.2159649133682251, "learning_rate": 0.001, "loss": 1.8536, "step": 576000 }, { "epoch": 33.695970053225714, "grad_norm": 0.20617689192295074, "learning_rate": 0.001, "loss": 1.8575, "step": 576100 }, { "epoch": 33.70181903257882, "grad_norm": 0.19498230516910553, "learning_rate": 0.001, "loss": 1.8609, "step": 576200 }, { "epoch": 33.707668011931915, "grad_norm": 0.21882838010787964, "learning_rate": 0.001, "loss": 1.8618, "step": 576300 }, { "epoch": 33.71351699128502, "grad_norm": 0.21038943529129028, "learning_rate": 0.001, "loss": 1.8539, "step": 576400 }, { "epoch": 33.71936597063812, "grad_norm": 0.22146011888980865, "learning_rate": 0.001, "loss": 1.8582, "step": 576500 }, { "epoch": 33.725214949991226, "grad_norm": 0.18432819843292236, "learning_rate": 0.001, "loss": 1.8569, "step": 576600 }, { "epoch": 33.73106392934433, "grad_norm": 0.23847562074661255, "learning_rate": 0.001, "loss": 1.8583, "step": 576700 }, { "epoch": 33.736912908697434, "grad_norm": 0.219040647149086, "learning_rate": 0.001, "loss": 1.8606, "step": 576800 }, { "epoch": 33.74276188805054, "grad_norm": 0.20696909725666046, "learning_rate": 0.001, "loss": 1.8523, "step": 576900 }, { "epoch": 33.74861086740364, "grad_norm": 0.19245393574237823, "learning_rate": 0.001, "loss": 1.8625, "step": 577000 }, { "epoch": 33.75445984675674, "grad_norm": 0.24356701970100403, "learning_rate": 0.001, "loss": 1.8563, "step": 577100 }, { "epoch": 33.76030882610984, "grad_norm": 0.2624721825122833, "learning_rate": 0.001, "loss": 1.8607, "step": 577200 }, { "epoch": 33.766157805462946, "grad_norm": 0.2634015679359436, "learning_rate": 0.001, "loss": 1.8562, "step": 577300 }, { "epoch": 33.77200678481605, "grad_norm": 0.22823834419250488, "learning_rate": 0.001, "loss": 1.8553, "step": 577400 }, { "epoch": 33.77785576416915, "grad_norm": 0.22636768221855164, "learning_rate": 0.001, "loss": 1.8595, "step": 577500 }, { "epoch": 33.78370474352226, "grad_norm": 0.19021908938884735, "learning_rate": 0.001, "loss": 1.8605, "step": 577600 }, { "epoch": 33.78955372287536, "grad_norm": 0.2516225576400757, "learning_rate": 0.001, "loss": 1.8517, "step": 577700 }, { "epoch": 33.79540270222846, "grad_norm": 0.2077825665473938, "learning_rate": 0.001, "loss": 1.8573, "step": 577800 }, { "epoch": 33.80125168158156, "grad_norm": 0.24747803807258606, "learning_rate": 0.001, "loss": 1.8588, "step": 577900 }, { "epoch": 33.807100660934665, "grad_norm": 0.23843704164028168, "learning_rate": 0.001, "loss": 1.8535, "step": 578000 }, { "epoch": 33.81294964028777, "grad_norm": 0.21169476211071014, "learning_rate": 0.001, "loss": 1.8608, "step": 578100 }, { "epoch": 33.81879861964087, "grad_norm": 0.21296581625938416, "learning_rate": 0.001, "loss": 1.8576, "step": 578200 }, { "epoch": 33.82464759899398, "grad_norm": 0.1904950588941574, "learning_rate": 0.001, "loss": 1.8577, "step": 578300 }, { "epoch": 33.83049657834708, "grad_norm": 0.23081457614898682, "learning_rate": 0.001, "loss": 1.8567, "step": 578400 }, { "epoch": 33.836345557700184, "grad_norm": 0.2229587584733963, "learning_rate": 0.001, "loss": 1.858, "step": 578500 }, { "epoch": 33.84219453705328, "grad_norm": 0.22094973921775818, "learning_rate": 0.001, "loss": 1.8645, "step": 578600 }, { "epoch": 33.848043516406385, "grad_norm": 0.1755305528640747, "learning_rate": 0.001, "loss": 1.8545, "step": 578700 }, { "epoch": 33.85389249575949, "grad_norm": 0.1849759966135025, "learning_rate": 0.001, "loss": 1.8585, "step": 578800 }, { "epoch": 33.85974147511259, "grad_norm": 0.2431768923997879, "learning_rate": 0.001, "loss": 1.8576, "step": 578900 }, { "epoch": 33.865590454465696, "grad_norm": 0.22523027658462524, "learning_rate": 0.001, "loss": 1.8585, "step": 579000 }, { "epoch": 33.8714394338188, "grad_norm": 0.19430510699748993, "learning_rate": 0.001, "loss": 1.8585, "step": 579100 }, { "epoch": 33.877288413171904, "grad_norm": 0.22060424089431763, "learning_rate": 0.001, "loss": 1.8604, "step": 579200 }, { "epoch": 33.88313739252501, "grad_norm": 0.18792852759361267, "learning_rate": 0.001, "loss": 1.857, "step": 579300 }, { "epoch": 33.888986371878104, "grad_norm": 0.18818704783916473, "learning_rate": 0.001, "loss": 1.8616, "step": 579400 }, { "epoch": 33.89483535123121, "grad_norm": 0.22483007609844208, "learning_rate": 0.001, "loss": 1.8572, "step": 579500 }, { "epoch": 33.90068433058431, "grad_norm": 0.17692433297634125, "learning_rate": 0.001, "loss": 1.856, "step": 579600 }, { "epoch": 33.906533309937416, "grad_norm": 0.19047535955905914, "learning_rate": 0.001, "loss": 1.8589, "step": 579700 }, { "epoch": 33.91238228929052, "grad_norm": 0.21845689415931702, "learning_rate": 0.001, "loss": 1.8651, "step": 579800 }, { "epoch": 33.91823126864362, "grad_norm": 0.19589610397815704, "learning_rate": 0.001, "loss": 1.8642, "step": 579900 }, { "epoch": 33.92408024799673, "grad_norm": 0.3192707896232605, "learning_rate": 0.001, "loss": 1.8632, "step": 580000 }, { "epoch": 33.92992922734983, "grad_norm": 0.22846996784210205, "learning_rate": 0.001, "loss": 1.8565, "step": 580100 }, { "epoch": 33.93577820670293, "grad_norm": 0.21548213064670563, "learning_rate": 0.001, "loss": 1.8552, "step": 580200 }, { "epoch": 33.94162718605603, "grad_norm": 0.17857420444488525, "learning_rate": 0.001, "loss": 1.8606, "step": 580300 }, { "epoch": 33.947476165409135, "grad_norm": 0.21043595671653748, "learning_rate": 0.001, "loss": 1.8572, "step": 580400 }, { "epoch": 33.95332514476224, "grad_norm": 0.22599215805530548, "learning_rate": 0.001, "loss": 1.859, "step": 580500 }, { "epoch": 33.95917412411534, "grad_norm": 0.21331281960010529, "learning_rate": 0.001, "loss": 1.8573, "step": 580600 }, { "epoch": 33.96502310346845, "grad_norm": 0.24275052547454834, "learning_rate": 0.001, "loss": 1.8615, "step": 580700 }, { "epoch": 33.97087208282155, "grad_norm": 0.2587900459766388, "learning_rate": 0.001, "loss": 1.8581, "step": 580800 }, { "epoch": 33.97672106217465, "grad_norm": 0.22762402892112732, "learning_rate": 0.001, "loss": 1.8621, "step": 580900 }, { "epoch": 33.98257004152775, "grad_norm": 0.1843102127313614, "learning_rate": 0.001, "loss": 1.8572, "step": 581000 }, { "epoch": 33.988419020880855, "grad_norm": 0.21786798536777496, "learning_rate": 0.001, "loss": 1.8584, "step": 581100 }, { "epoch": 33.99426800023396, "grad_norm": 0.2392406463623047, "learning_rate": 0.001, "loss": 1.8569, "step": 581200 }, { "epoch": 34.00011697958706, "grad_norm": 0.19374622404575348, "learning_rate": 0.001, "loss": 1.8606, "step": 581300 }, { "epoch": 34.005965958940166, "grad_norm": 0.27938202023506165, "learning_rate": 0.001, "loss": 1.8445, "step": 581400 }, { "epoch": 34.01181493829327, "grad_norm": 0.17860637605190277, "learning_rate": 0.001, "loss": 1.8432, "step": 581500 }, { "epoch": 34.017663917646374, "grad_norm": 0.18463172018527985, "learning_rate": 0.001, "loss": 1.8508, "step": 581600 }, { "epoch": 34.02351289699947, "grad_norm": 0.14947609603405, "learning_rate": 0.001, "loss": 1.8474, "step": 581700 }, { "epoch": 34.029361876352574, "grad_norm": 0.17362137138843536, "learning_rate": 0.001, "loss": 1.8428, "step": 581800 }, { "epoch": 34.03521085570568, "grad_norm": 0.20848751068115234, "learning_rate": 0.001, "loss": 1.8425, "step": 581900 }, { "epoch": 34.04105983505878, "grad_norm": 0.18588761985301971, "learning_rate": 0.001, "loss": 1.8417, "step": 582000 }, { "epoch": 34.046908814411886, "grad_norm": 0.20698322355747223, "learning_rate": 0.001, "loss": 1.8422, "step": 582100 }, { "epoch": 34.05275779376499, "grad_norm": 0.16960348188877106, "learning_rate": 0.001, "loss": 1.8513, "step": 582200 }, { "epoch": 34.05860677311809, "grad_norm": 0.2233525663614273, "learning_rate": 0.001, "loss": 1.8495, "step": 582300 }, { "epoch": 34.0644557524712, "grad_norm": 0.1569087952375412, "learning_rate": 0.001, "loss": 1.8456, "step": 582400 }, { "epoch": 34.070304731824294, "grad_norm": 0.15586644411087036, "learning_rate": 0.001, "loss": 1.849, "step": 582500 }, { "epoch": 34.0761537111774, "grad_norm": 0.18458589911460876, "learning_rate": 0.001, "loss": 1.8459, "step": 582600 }, { "epoch": 34.0820026905305, "grad_norm": 0.27459466457366943, "learning_rate": 0.001, "loss": 1.8456, "step": 582700 }, { "epoch": 34.087851669883605, "grad_norm": 0.16267381608486176, "learning_rate": 0.001, "loss": 1.8495, "step": 582800 }, { "epoch": 34.09370064923671, "grad_norm": 0.1434936374425888, "learning_rate": 0.001, "loss": 1.8469, "step": 582900 }, { "epoch": 34.09954962858981, "grad_norm": 0.2466229349374771, "learning_rate": 0.001, "loss": 1.8435, "step": 583000 }, { "epoch": 34.10539860794292, "grad_norm": 0.22507846355438232, "learning_rate": 0.001, "loss": 1.851, "step": 583100 }, { "epoch": 34.11124758729602, "grad_norm": 0.219479039311409, "learning_rate": 0.001, "loss": 1.8505, "step": 583200 }, { "epoch": 34.11709656664912, "grad_norm": 0.22430512309074402, "learning_rate": 0.001, "loss": 1.8491, "step": 583300 }, { "epoch": 34.12294554600222, "grad_norm": 0.1686488389968872, "learning_rate": 0.001, "loss": 1.8454, "step": 583400 }, { "epoch": 34.128794525355325, "grad_norm": 0.22738441824913025, "learning_rate": 0.001, "loss": 1.844, "step": 583500 }, { "epoch": 34.13464350470843, "grad_norm": 0.2397090494632721, "learning_rate": 0.001, "loss": 1.8503, "step": 583600 }, { "epoch": 34.14049248406153, "grad_norm": 0.15349771082401276, "learning_rate": 0.001, "loss": 1.8487, "step": 583700 }, { "epoch": 34.146341463414636, "grad_norm": 0.22177039086818695, "learning_rate": 0.001, "loss": 1.8501, "step": 583800 }, { "epoch": 34.15219044276774, "grad_norm": 0.229667067527771, "learning_rate": 0.001, "loss": 1.8508, "step": 583900 }, { "epoch": 34.15803942212084, "grad_norm": 0.23200640082359314, "learning_rate": 0.001, "loss": 1.8467, "step": 584000 }, { "epoch": 34.16388840147394, "grad_norm": 0.18450039625167847, "learning_rate": 0.001, "loss": 1.8543, "step": 584100 }, { "epoch": 34.169737380827044, "grad_norm": 0.1808883547782898, "learning_rate": 0.001, "loss": 1.8487, "step": 584200 }, { "epoch": 34.17558636018015, "grad_norm": 0.14474141597747803, "learning_rate": 0.001, "loss": 1.848, "step": 584300 }, { "epoch": 34.18143533953325, "grad_norm": 0.18320412933826447, "learning_rate": 0.001, "loss": 1.8501, "step": 584400 }, { "epoch": 34.187284318886356, "grad_norm": 0.19043271243572235, "learning_rate": 0.001, "loss": 1.8509, "step": 584500 }, { "epoch": 34.19313329823946, "grad_norm": 0.3151434063911438, "learning_rate": 0.001, "loss": 1.8478, "step": 584600 }, { "epoch": 34.19898227759256, "grad_norm": 0.19157551229000092, "learning_rate": 0.001, "loss": 1.8508, "step": 584700 }, { "epoch": 34.20483125694566, "grad_norm": 0.13461317121982574, "learning_rate": 0.001, "loss": 1.8526, "step": 584800 }, { "epoch": 34.210680236298764, "grad_norm": 0.16955873370170593, "learning_rate": 0.001, "loss": 1.8491, "step": 584900 }, { "epoch": 34.21652921565187, "grad_norm": 0.2028988003730774, "learning_rate": 0.001, "loss": 1.849, "step": 585000 }, { "epoch": 34.22237819500497, "grad_norm": 0.15953154861927032, "learning_rate": 0.001, "loss": 1.8517, "step": 585100 }, { "epoch": 34.228227174358075, "grad_norm": 0.2315700352191925, "learning_rate": 0.001, "loss": 1.852, "step": 585200 }, { "epoch": 34.23407615371118, "grad_norm": 0.219327911734581, "learning_rate": 0.001, "loss": 1.855, "step": 585300 }, { "epoch": 34.23992513306428, "grad_norm": 0.1429232507944107, "learning_rate": 0.001, "loss": 1.8476, "step": 585400 }, { "epoch": 34.24577411241739, "grad_norm": 0.1554519385099411, "learning_rate": 0.001, "loss": 1.8489, "step": 585500 }, { "epoch": 34.25162309177048, "grad_norm": 0.21045970916748047, "learning_rate": 0.001, "loss": 1.8546, "step": 585600 }, { "epoch": 34.25747207112359, "grad_norm": 0.19420811533927917, "learning_rate": 0.001, "loss": 1.8516, "step": 585700 }, { "epoch": 34.26332105047669, "grad_norm": 0.18573793768882751, "learning_rate": 0.001, "loss": 1.8543, "step": 585800 }, { "epoch": 34.269170029829795, "grad_norm": 0.21049565076828003, "learning_rate": 0.001, "loss": 1.8456, "step": 585900 }, { "epoch": 34.2750190091829, "grad_norm": 0.1816740185022354, "learning_rate": 0.001, "loss": 1.8499, "step": 586000 }, { "epoch": 34.280867988536, "grad_norm": 0.20337443053722382, "learning_rate": 0.001, "loss": 1.8489, "step": 586100 }, { "epoch": 34.286716967889106, "grad_norm": 0.23432877659797668, "learning_rate": 0.001, "loss": 1.8536, "step": 586200 }, { "epoch": 34.29256594724221, "grad_norm": 0.2715132534503937, "learning_rate": 0.001, "loss": 1.8497, "step": 586300 }, { "epoch": 34.298414926595306, "grad_norm": 0.20230835676193237, "learning_rate": 0.001, "loss": 1.85, "step": 586400 }, { "epoch": 34.30426390594841, "grad_norm": 0.19051159918308258, "learning_rate": 0.001, "loss": 1.8533, "step": 586500 }, { "epoch": 34.310112885301514, "grad_norm": 0.21481600403785706, "learning_rate": 0.001, "loss": 1.8476, "step": 586600 }, { "epoch": 34.31596186465462, "grad_norm": 0.2806928753852844, "learning_rate": 0.001, "loss": 1.8466, "step": 586700 }, { "epoch": 34.32181084400772, "grad_norm": 0.2782362699508667, "learning_rate": 0.001, "loss": 1.8493, "step": 586800 }, { "epoch": 34.327659823360825, "grad_norm": 0.3407117426395416, "learning_rate": 0.001, "loss": 1.8498, "step": 586900 }, { "epoch": 34.33350880271393, "grad_norm": 0.21679645776748657, "learning_rate": 0.001, "loss": 1.8522, "step": 587000 }, { "epoch": 34.339357782067026, "grad_norm": 0.22526408731937408, "learning_rate": 0.001, "loss": 1.8513, "step": 587100 }, { "epoch": 34.34520676142013, "grad_norm": 0.26065394282341003, "learning_rate": 0.001, "loss": 1.8526, "step": 587200 }, { "epoch": 34.35105574077323, "grad_norm": 0.23885080218315125, "learning_rate": 0.001, "loss": 1.8482, "step": 587300 }, { "epoch": 34.35690472012634, "grad_norm": 0.20560504496097565, "learning_rate": 0.001, "loss": 1.8539, "step": 587400 }, { "epoch": 34.36275369947944, "grad_norm": 0.2003147006034851, "learning_rate": 0.001, "loss": 1.861, "step": 587500 }, { "epoch": 34.368602678832545, "grad_norm": 0.18893903493881226, "learning_rate": 0.001, "loss": 1.8548, "step": 587600 }, { "epoch": 34.37445165818565, "grad_norm": 0.2108544260263443, "learning_rate": 0.001, "loss": 1.8525, "step": 587700 }, { "epoch": 34.38030063753875, "grad_norm": 0.16949322819709778, "learning_rate": 0.001, "loss": 1.8521, "step": 587800 }, { "epoch": 34.38614961689185, "grad_norm": 0.19590216875076294, "learning_rate": 0.001, "loss": 1.8511, "step": 587900 }, { "epoch": 34.39199859624495, "grad_norm": 0.16392359137535095, "learning_rate": 0.001, "loss": 1.8542, "step": 588000 }, { "epoch": 34.39784757559806, "grad_norm": 0.20218035578727722, "learning_rate": 0.001, "loss": 1.8514, "step": 588100 }, { "epoch": 34.40369655495116, "grad_norm": 0.30763334035873413, "learning_rate": 0.001, "loss": 1.8498, "step": 588200 }, { "epoch": 34.409545534304264, "grad_norm": 0.22789357602596283, "learning_rate": 0.001, "loss": 1.8545, "step": 588300 }, { "epoch": 34.41539451365737, "grad_norm": 0.1901976317167282, "learning_rate": 0.001, "loss": 1.8519, "step": 588400 }, { "epoch": 34.42124349301047, "grad_norm": 0.23197825253009796, "learning_rate": 0.001, "loss": 1.8576, "step": 588500 }, { "epoch": 34.427092472363576, "grad_norm": 0.25163087248802185, "learning_rate": 0.001, "loss": 1.8497, "step": 588600 }, { "epoch": 34.43294145171667, "grad_norm": 0.17113028466701508, "learning_rate": 0.001, "loss": 1.8491, "step": 588700 }, { "epoch": 34.438790431069776, "grad_norm": 0.21102511882781982, "learning_rate": 0.001, "loss": 1.8528, "step": 588800 }, { "epoch": 34.44463941042288, "grad_norm": 0.20598407089710236, "learning_rate": 0.001, "loss": 1.8497, "step": 588900 }, { "epoch": 34.450488389775984, "grad_norm": 0.21358059346675873, "learning_rate": 0.001, "loss": 1.8498, "step": 589000 }, { "epoch": 34.45633736912909, "grad_norm": 0.2188253551721573, "learning_rate": 0.001, "loss": 1.8527, "step": 589100 }, { "epoch": 34.46218634848219, "grad_norm": 0.2103394865989685, "learning_rate": 0.001, "loss": 1.8569, "step": 589200 }, { "epoch": 34.468035327835295, "grad_norm": 0.16296246647834778, "learning_rate": 0.001, "loss": 1.8498, "step": 589300 }, { "epoch": 34.4738843071884, "grad_norm": 0.21042704582214355, "learning_rate": 0.001, "loss": 1.8543, "step": 589400 }, { "epoch": 34.479733286541496, "grad_norm": 0.2671123743057251, "learning_rate": 0.001, "loss": 1.8554, "step": 589500 }, { "epoch": 34.4855822658946, "grad_norm": 0.24031728506088257, "learning_rate": 0.001, "loss": 1.8479, "step": 589600 }, { "epoch": 34.4914312452477, "grad_norm": 0.2572202682495117, "learning_rate": 0.001, "loss": 1.8525, "step": 589700 }, { "epoch": 34.49728022460081, "grad_norm": 0.21343320608139038, "learning_rate": 0.001, "loss": 1.8522, "step": 589800 }, { "epoch": 34.50312920395391, "grad_norm": 0.19157648086547852, "learning_rate": 0.001, "loss": 1.8481, "step": 589900 }, { "epoch": 34.508978183307015, "grad_norm": 0.22265733778476715, "learning_rate": 0.001, "loss": 1.8576, "step": 590000 }, { "epoch": 34.51482716266012, "grad_norm": 0.20285578072071075, "learning_rate": 0.001, "loss": 1.8559, "step": 590100 }, { "epoch": 34.520676142013215, "grad_norm": 0.2504899203777313, "learning_rate": 0.001, "loss": 1.8589, "step": 590200 }, { "epoch": 34.52652512136632, "grad_norm": 0.15506136417388916, "learning_rate": 0.001, "loss": 1.8467, "step": 590300 }, { "epoch": 34.53237410071942, "grad_norm": 0.21767932176589966, "learning_rate": 0.001, "loss": 1.8568, "step": 590400 }, { "epoch": 34.53822308007253, "grad_norm": 0.26797839999198914, "learning_rate": 0.001, "loss": 1.8504, "step": 590500 }, { "epoch": 34.54407205942563, "grad_norm": 0.20184031128883362, "learning_rate": 0.001, "loss": 1.8507, "step": 590600 }, { "epoch": 34.549921038778734, "grad_norm": 0.19419291615486145, "learning_rate": 0.001, "loss": 1.8503, "step": 590700 }, { "epoch": 34.55577001813184, "grad_norm": 0.19033418595790863, "learning_rate": 0.001, "loss": 1.8544, "step": 590800 }, { "epoch": 34.56161899748494, "grad_norm": 0.22431153059005737, "learning_rate": 0.001, "loss": 1.8557, "step": 590900 }, { "epoch": 34.56746797683804, "grad_norm": 0.27581852674484253, "learning_rate": 0.001, "loss": 1.8496, "step": 591000 }, { "epoch": 34.57331695619114, "grad_norm": 0.24048100411891937, "learning_rate": 0.001, "loss": 1.858, "step": 591100 }, { "epoch": 34.579165935544246, "grad_norm": 0.23520135879516602, "learning_rate": 0.001, "loss": 1.854, "step": 591200 }, { "epoch": 34.58501491489735, "grad_norm": 0.24321851134300232, "learning_rate": 0.001, "loss": 1.8508, "step": 591300 }, { "epoch": 34.590863894250454, "grad_norm": 0.14512953162193298, "learning_rate": 0.001, "loss": 1.8547, "step": 591400 }, { "epoch": 34.59671287360356, "grad_norm": 0.1592622846364975, "learning_rate": 0.001, "loss": 1.8564, "step": 591500 }, { "epoch": 34.60256185295666, "grad_norm": 0.171926811337471, "learning_rate": 0.001, "loss": 1.8506, "step": 591600 }, { "epoch": 34.608410832309765, "grad_norm": 0.1813744604587555, "learning_rate": 0.001, "loss": 1.8526, "step": 591700 }, { "epoch": 34.61425981166286, "grad_norm": 0.2617669999599457, "learning_rate": 0.001, "loss": 1.8545, "step": 591800 }, { "epoch": 34.620108791015966, "grad_norm": 0.18243277072906494, "learning_rate": 0.001, "loss": 1.8504, "step": 591900 }, { "epoch": 34.62595777036907, "grad_norm": 0.13554918766021729, "learning_rate": 0.001, "loss": 1.8567, "step": 592000 }, { "epoch": 34.63180674972217, "grad_norm": 0.15053609013557434, "learning_rate": 0.001, "loss": 1.8519, "step": 592100 }, { "epoch": 34.63765572907528, "grad_norm": 0.22915469110012054, "learning_rate": 0.001, "loss": 1.8517, "step": 592200 }, { "epoch": 34.64350470842838, "grad_norm": 0.35194727778434753, "learning_rate": 0.001, "loss": 1.8572, "step": 592300 }, { "epoch": 34.649353687781485, "grad_norm": 0.1717011034488678, "learning_rate": 0.001, "loss": 1.8531, "step": 592400 }, { "epoch": 34.65520266713459, "grad_norm": 0.23497594892978668, "learning_rate": 0.001, "loss": 1.8554, "step": 592500 }, { "epoch": 34.661051646487685, "grad_norm": 0.22433093190193176, "learning_rate": 0.001, "loss": 1.8656, "step": 592600 }, { "epoch": 34.66690062584079, "grad_norm": 0.15109847486019135, "learning_rate": 0.001, "loss": 1.8534, "step": 592700 }, { "epoch": 34.67274960519389, "grad_norm": 0.19713068008422852, "learning_rate": 0.001, "loss": 1.8489, "step": 592800 }, { "epoch": 34.678598584547, "grad_norm": 0.24141116440296173, "learning_rate": 0.001, "loss": 1.8547, "step": 592900 }, { "epoch": 34.6844475639001, "grad_norm": 0.17301033437252045, "learning_rate": 0.001, "loss": 1.8505, "step": 593000 }, { "epoch": 34.690296543253204, "grad_norm": 0.21204213798046112, "learning_rate": 0.001, "loss": 1.8555, "step": 593100 }, { "epoch": 34.69614552260631, "grad_norm": 0.18937933444976807, "learning_rate": 0.001, "loss": 1.8537, "step": 593200 }, { "epoch": 34.701994501959405, "grad_norm": 0.26210817694664, "learning_rate": 0.001, "loss": 1.8528, "step": 593300 }, { "epoch": 34.70784348131251, "grad_norm": 0.16817989945411682, "learning_rate": 0.001, "loss": 1.853, "step": 593400 }, { "epoch": 34.71369246066561, "grad_norm": 0.14607612788677216, "learning_rate": 0.001, "loss": 1.8512, "step": 593500 }, { "epoch": 34.719541440018716, "grad_norm": 0.20552030205726624, "learning_rate": 0.001, "loss": 1.852, "step": 593600 }, { "epoch": 34.72539041937182, "grad_norm": 0.1962968409061432, "learning_rate": 0.001, "loss": 1.8557, "step": 593700 }, { "epoch": 34.731239398724924, "grad_norm": 0.2518377900123596, "learning_rate": 0.001, "loss": 1.8574, "step": 593800 }, { "epoch": 34.73708837807803, "grad_norm": 0.17807351052761078, "learning_rate": 0.001, "loss": 1.8563, "step": 593900 }, { "epoch": 34.74293735743113, "grad_norm": 0.21547721326351166, "learning_rate": 0.001, "loss": 1.8501, "step": 594000 }, { "epoch": 34.74878633678423, "grad_norm": 0.1826431304216385, "learning_rate": 0.001, "loss": 1.8516, "step": 594100 }, { "epoch": 34.75463531613733, "grad_norm": 0.19753263890743256, "learning_rate": 0.001, "loss": 1.8564, "step": 594200 }, { "epoch": 34.760484295490436, "grad_norm": 0.22295239567756653, "learning_rate": 0.001, "loss": 1.8584, "step": 594300 }, { "epoch": 34.76633327484354, "grad_norm": 0.20095930993556976, "learning_rate": 0.001, "loss": 1.857, "step": 594400 }, { "epoch": 34.77218225419664, "grad_norm": 0.33793067932128906, "learning_rate": 0.001, "loss": 1.8592, "step": 594500 }, { "epoch": 34.77803123354975, "grad_norm": 0.16409562528133392, "learning_rate": 0.001, "loss": 1.8573, "step": 594600 }, { "epoch": 34.78388021290285, "grad_norm": 0.22729477286338806, "learning_rate": 0.001, "loss": 1.8599, "step": 594700 }, { "epoch": 34.789729192255955, "grad_norm": 0.22657926380634308, "learning_rate": 0.001, "loss": 1.8547, "step": 594800 }, { "epoch": 34.79557817160905, "grad_norm": 0.17451901733875275, "learning_rate": 0.001, "loss": 1.8495, "step": 594900 }, { "epoch": 34.801427150962155, "grad_norm": 0.19040130078792572, "learning_rate": 0.001, "loss": 1.8563, "step": 595000 }, { "epoch": 34.80727613031526, "grad_norm": 0.14184553921222687, "learning_rate": 0.001, "loss": 1.8534, "step": 595100 }, { "epoch": 34.81312510966836, "grad_norm": 0.15872107446193695, "learning_rate": 0.001, "loss": 1.8539, "step": 595200 }, { "epoch": 34.81897408902147, "grad_norm": 0.18478836119174957, "learning_rate": 0.001, "loss": 1.8533, "step": 595300 }, { "epoch": 34.82482306837457, "grad_norm": 0.20778079330921173, "learning_rate": 0.001, "loss": 1.8597, "step": 595400 }, { "epoch": 34.830672047727674, "grad_norm": 0.24218390882015228, "learning_rate": 0.001, "loss": 1.8541, "step": 595500 }, { "epoch": 34.83652102708078, "grad_norm": 0.19619286060333252, "learning_rate": 0.001, "loss": 1.855, "step": 595600 }, { "epoch": 34.842370006433875, "grad_norm": 0.16342976689338684, "learning_rate": 0.001, "loss": 1.856, "step": 595700 }, { "epoch": 34.84821898578698, "grad_norm": 0.26292121410369873, "learning_rate": 0.001, "loss": 1.8552, "step": 595800 }, { "epoch": 34.85406796514008, "grad_norm": 0.20229244232177734, "learning_rate": 0.001, "loss": 1.8599, "step": 595900 }, { "epoch": 34.859916944493186, "grad_norm": 0.18576617538928986, "learning_rate": 0.001, "loss": 1.8542, "step": 596000 }, { "epoch": 34.86576592384629, "grad_norm": 0.18218328058719635, "learning_rate": 0.001, "loss": 1.856, "step": 596100 }, { "epoch": 34.871614903199394, "grad_norm": 0.26774927973747253, "learning_rate": 0.001, "loss": 1.8535, "step": 596200 }, { "epoch": 34.8774638825525, "grad_norm": 0.28207260370254517, "learning_rate": 0.001, "loss": 1.8589, "step": 596300 }, { "epoch": 34.883312861905594, "grad_norm": 0.19640688598155975, "learning_rate": 0.001, "loss": 1.8515, "step": 596400 }, { "epoch": 34.8891618412587, "grad_norm": 0.1602420061826706, "learning_rate": 0.001, "loss": 1.851, "step": 596500 }, { "epoch": 34.8950108206118, "grad_norm": 0.16155710816383362, "learning_rate": 0.001, "loss": 1.8537, "step": 596600 }, { "epoch": 34.900859799964906, "grad_norm": 0.19232484698295593, "learning_rate": 0.001, "loss": 1.8584, "step": 596700 }, { "epoch": 34.90670877931801, "grad_norm": 0.1763300746679306, "learning_rate": 0.001, "loss": 1.8618, "step": 596800 }, { "epoch": 34.91255775867111, "grad_norm": 0.2288006842136383, "learning_rate": 0.001, "loss": 1.8568, "step": 596900 }, { "epoch": 34.91840673802422, "grad_norm": 0.18533168733119965, "learning_rate": 0.001, "loss": 1.8553, "step": 597000 }, { "epoch": 34.92425571737732, "grad_norm": 0.1671360731124878, "learning_rate": 0.001, "loss": 1.8573, "step": 597100 }, { "epoch": 34.93010469673042, "grad_norm": 0.18000052869319916, "learning_rate": 0.001, "loss": 1.8501, "step": 597200 }, { "epoch": 34.93595367608352, "grad_norm": 0.2478480339050293, "learning_rate": 0.001, "loss": 1.8599, "step": 597300 }, { "epoch": 34.941802655436625, "grad_norm": 0.22613365948200226, "learning_rate": 0.001, "loss": 1.8577, "step": 597400 }, { "epoch": 34.94765163478973, "grad_norm": 0.1420476734638214, "learning_rate": 0.001, "loss": 1.8532, "step": 597500 }, { "epoch": 34.95350061414283, "grad_norm": 0.18145638704299927, "learning_rate": 0.001, "loss": 1.8576, "step": 597600 }, { "epoch": 34.959349593495936, "grad_norm": 0.2068995237350464, "learning_rate": 0.001, "loss": 1.8562, "step": 597700 }, { "epoch": 34.96519857284904, "grad_norm": 0.17516756057739258, "learning_rate": 0.001, "loss": 1.851, "step": 597800 }, { "epoch": 34.971047552202144, "grad_norm": 0.2265363186597824, "learning_rate": 0.001, "loss": 1.8566, "step": 597900 }, { "epoch": 34.97689653155524, "grad_norm": 0.19621910154819489, "learning_rate": 0.001, "loss": 1.8589, "step": 598000 }, { "epoch": 34.982745510908344, "grad_norm": 0.2149301916360855, "learning_rate": 0.001, "loss": 1.8541, "step": 598100 }, { "epoch": 34.98859449026145, "grad_norm": 0.26036322116851807, "learning_rate": 0.001, "loss": 1.8587, "step": 598200 }, { "epoch": 34.99444346961455, "grad_norm": 0.17267736792564392, "learning_rate": 0.001, "loss": 1.8511, "step": 598300 }, { "epoch": 35.000292448967656, "grad_norm": 0.20088237524032593, "learning_rate": 0.001, "loss": 1.8615, "step": 598400 }, { "epoch": 35.00614142832076, "grad_norm": 0.17136314511299133, "learning_rate": 0.001, "loss": 1.8377, "step": 598500 }, { "epoch": 35.01199040767386, "grad_norm": 0.18102091550827026, "learning_rate": 0.001, "loss": 1.837, "step": 598600 }, { "epoch": 35.01783938702697, "grad_norm": 0.3100382685661316, "learning_rate": 0.001, "loss": 1.8364, "step": 598700 }, { "epoch": 35.023688366380064, "grad_norm": 0.1976974457502365, "learning_rate": 0.001, "loss": 1.8416, "step": 598800 }, { "epoch": 35.02953734573317, "grad_norm": 0.20943281054496765, "learning_rate": 0.001, "loss": 1.8444, "step": 598900 }, { "epoch": 35.03538632508627, "grad_norm": 0.2529895305633545, "learning_rate": 0.001, "loss": 1.8457, "step": 599000 }, { "epoch": 35.041235304439375, "grad_norm": 0.17161478102207184, "learning_rate": 0.001, "loss": 1.8438, "step": 599100 }, { "epoch": 35.04708428379248, "grad_norm": 0.18232542276382446, "learning_rate": 0.001, "loss": 1.8451, "step": 599200 }, { "epoch": 35.05293326314558, "grad_norm": 0.22038592398166656, "learning_rate": 0.001, "loss": 1.8406, "step": 599300 }, { "epoch": 35.05878224249869, "grad_norm": 0.2390139251947403, "learning_rate": 0.001, "loss": 1.8375, "step": 599400 }, { "epoch": 35.06463122185178, "grad_norm": 0.1700025498867035, "learning_rate": 0.001, "loss": 1.845, "step": 599500 }, { "epoch": 35.07048020120489, "grad_norm": 0.17746850848197937, "learning_rate": 0.001, "loss": 1.8389, "step": 599600 }, { "epoch": 35.07632918055799, "grad_norm": 0.18935060501098633, "learning_rate": 0.001, "loss": 1.8421, "step": 599700 }, { "epoch": 35.082178159911095, "grad_norm": 0.1334754228591919, "learning_rate": 0.001, "loss": 1.8409, "step": 599800 }, { "epoch": 35.0880271392642, "grad_norm": 0.1787615567445755, "learning_rate": 0.001, "loss": 1.8408, "step": 599900 }, { "epoch": 35.0938761186173, "grad_norm": 0.23240314424037933, "learning_rate": 0.001, "loss": 1.8482, "step": 600000 }, { "epoch": 35.0938761186173, "eval_ag_news_accuracy": 0.244734375, "eval_ag_news_bleu_score": 7.0857538888991485, "eval_ag_news_bleu_score_sem": 0.48777895704669333, "eval_ag_news_emb_cos_sim": 0.7118459939956665, "eval_ag_news_emb_cos_sim_sem": 0.015290695242583752, "eval_ag_news_emb_top1_equal": 0.9375, "eval_ag_news_emb_top1_equal_sem": 0.02147948183119297, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.713911294937134, "eval_ag_news_n_ngrams_match_1": 14.3515625, "eval_ag_news_n_ngrams_match_2": 4.1328125, "eval_ag_news_n_ngrams_match_3": 1.5625, "eval_ag_news_num_pred_words": 47.1171875, "eval_ag_news_num_true_words": 44.5234375, "eval_ag_news_perplexity": 15.088174554776247, "eval_ag_news_pred_num_tokens": 70.71875, "eval_ag_news_rouge_score": 0.2989253273383677, "eval_ag_news_runtime": 38.0236, "eval_ag_news_samples_per_second": 13.15, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3411353321045864, "eval_ag_news_token_set_f1_sem": 0.009927290877962841, "eval_ag_news_token_set_precision": 0.3287123863281019, "eval_ag_news_token_set_recall": 0.36761740163077244, "eval_ag_news_true_num_tokens": 61.203125, "step": 600000 }, { "epoch": 35.0938761186173, "eval_anthropic_toxic_prompts_accuracy": 0.09959375, "eval_anthropic_toxic_prompts_bleu_score": 46.91357596118261, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.8200355015170513, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.899344801902771, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.01101523544639349, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1796875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03406800775380022, "eval_anthropic_toxic_prompts_loss": 1.260940432548523, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.890625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.6953125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.75, "eval_anthropic_toxic_prompts_num_pred_words": 14.0703125, "eval_anthropic_toxic_prompts_num_true_words": 13.0546875, "eval_anthropic_toxic_prompts_perplexity": 3.528738467933142, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.515625, "eval_anthropic_toxic_prompts_rouge_score": 0.7239625269937069, "eval_anthropic_toxic_prompts_runtime": 31.0905, "eval_anthropic_toxic_prompts_samples_per_second": 16.082, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.7365350890876633, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018947065197895865, "eval_anthropic_toxic_prompts_token_set_precision": 0.7441542748846486, "eval_anthropic_toxic_prompts_token_set_recall": 0.7390348821875724, "eval_anthropic_toxic_prompts_true_num_tokens": 16.0703125, "step": 600000 }, { "epoch": 35.0938761186173, "eval_arxiv_accuracy": 0.37703125, "eval_arxiv_bleu_score": 1.6326281046509203, "eval_arxiv_bleu_score_sem": 0.17242166390064012, "eval_arxiv_emb_cos_sim": 0.4795035719871521, "eval_arxiv_emb_cos_sim_sem": 0.018952349200844765, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4280660152435303, "eval_arxiv_n_ngrams_match_1": 12.1640625, "eval_arxiv_n_ngrams_match_2": 2.15625, "eval_arxiv_n_ngrams_match_3": 0.4375, "eval_arxiv_num_pred_words": 54.0625, "eval_arxiv_num_true_words": 85.8046875, "eval_arxiv_perplexity": 30.81698549996863, "eval_arxiv_pred_num_tokens": 125.5234375, "eval_arxiv_rouge_score": 0.16120658694933726, "eval_arxiv_runtime": 31.1474, "eval_arxiv_samples_per_second": 16.053, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.16247753379674024, "eval_arxiv_token_set_f1_sem": 0.00835908595377421, "eval_arxiv_token_set_precision": 0.10988237720037838, "eval_arxiv_token_set_recall": 0.4113042101200632, "eval_arxiv_true_num_tokens": 125.625, "step": 600000 }, { "epoch": 35.0938761186173, "eval_python_code_alpaca_accuracy": 0.131390625, "eval_python_code_alpaca_bleu_score": 28.634827668970694, "eval_python_code_alpaca_bleu_score_sem": 1.5808955362162487, "eval_python_code_alpaca_emb_cos_sim": 0.8623572587966919, "eval_python_code_alpaca_emb_cos_sim_sem": 0.00899417232722044, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.5049047470092773, "eval_python_code_alpaca_n_ngrams_match_1": 10.890625, "eval_python_code_alpaca_n_ngrams_match_2": 6.0546875, "eval_python_code_alpaca_n_ngrams_match_3": 3.375, "eval_python_code_alpaca_num_pred_words": 18.0546875, "eval_python_code_alpaca_num_true_words": 18.5703125, "eval_python_code_alpaca_perplexity": 4.503724616617267, "eval_python_code_alpaca_pred_num_tokens": 23.8515625, "eval_python_code_alpaca_rouge_score": 0.6149758369147849, "eval_python_code_alpaca_runtime": 30.4437, "eval_python_code_alpaca_samples_per_second": 16.424, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6264577775658229, "eval_python_code_alpaca_token_set_f1_sem": 0.012928897425646249, "eval_python_code_alpaca_token_set_precision": 0.620652216372618, "eval_python_code_alpaca_token_set_recall": 0.6364660984512768, "eval_python_code_alpaca_true_num_tokens": 23.8828125, "step": 600000 }, { "epoch": 35.0938761186173, "eval_wikibio_accuracy": 0.36959375, "eval_wikibio_bleu_score": 7.803750932449618, "eval_wikibio_bleu_score_sem": 0.724320551913231, "eval_wikibio_emb_cos_sim": 0.6058633923530579, "eval_wikibio_emb_cos_sim_sem": 0.02208566665649414, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7174408435821533, "eval_wikibio_n_ngrams_match_1": 15.265625, "eval_wikibio_n_ngrams_match_2": 5.34375, "eval_wikibio_n_ngrams_match_3": 2.3515625, "eval_wikibio_num_pred_words": 52.1796875, "eval_wikibio_num_true_words": 54.3515625, "eval_wikibio_perplexity": 15.14152309357991, "eval_wikibio_pred_num_tokens": 106.71875, "eval_wikibio_rouge_score": 0.29556212604332555, "eval_wikibio_runtime": 32.2531, "eval_wikibio_samples_per_second": 15.502, "eval_wikibio_steps_per_second": 0.031, "eval_wikibio_token_set_f1": 0.3190098190013597, "eval_wikibio_token_set_f1_sem": 0.012130374216619453, "eval_wikibio_token_set_precision": 0.2814319506347178, "eval_wikibio_token_set_recall": 0.4144894832501578, "eval_wikibio_true_num_tokens": 102.359375, "step": 600000 }, { "epoch": 35.0938761186173, "eval_msmarco_accuracy": 0.393953125, "eval_msmarco_bleu_score": 15.541650393073347, "eval_msmarco_bleu_score_sem": 1.2476892284856025, "eval_msmarco_emb_cos_sim": 0.8011553287506104, "eval_msmarco_emb_cos_sim_sem": 0.015223059803247452, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7706098556518555, "eval_msmarco_n_ngrams_match_1": 26.46875, "eval_msmarco_n_ngrams_match_2": 11.2734375, "eval_msmarco_n_ngrams_match_3": 6.125, "eval_msmarco_num_pred_words": 60.640625, "eval_msmarco_num_true_words": 58.0, "eval_msmarco_perplexity": 5.874434826463613, "eval_msmarco_pred_num_tokens": 81.8515625, "eval_msmarco_rouge_score": 0.435723593797263, "eval_msmarco_runtime": 26.333, "eval_msmarco_samples_per_second": 18.988, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.46149066976154285, "eval_msmarco_token_set_f1_sem": 0.013515329213490876, "eval_msmarco_token_set_precision": 0.43157707466935286, "eval_msmarco_token_set_recall": 0.5180157595746365, "eval_msmarco_true_num_tokens": 76.1796875, "step": 600000 }, { "epoch": 35.099725097970406, "grad_norm": 0.17716827988624573, "learning_rate": 0.001, "loss": 1.8471, "step": 600100 }, { "epoch": 35.10557407732351, "grad_norm": 0.1984236240386963, "learning_rate": 0.001, "loss": 1.8405, "step": 600200 }, { "epoch": 35.11142305667661, "grad_norm": 0.17290852963924408, "learning_rate": 0.001, "loss": 1.8465, "step": 600300 }, { "epoch": 35.11727203602971, "grad_norm": 0.19463278353214264, "learning_rate": 0.001, "loss": 1.8443, "step": 600400 }, { "epoch": 35.123121015382814, "grad_norm": 0.18760374188423157, "learning_rate": 0.001, "loss": 1.8469, "step": 600500 }, { "epoch": 35.12896999473592, "grad_norm": 0.19451260566711426, "learning_rate": 0.001, "loss": 1.8425, "step": 600600 }, { "epoch": 35.13481897408902, "grad_norm": 0.15343008935451508, "learning_rate": 0.001, "loss": 1.8386, "step": 600700 }, { "epoch": 35.140667953442126, "grad_norm": 0.23879963159561157, "learning_rate": 0.001, "loss": 1.845, "step": 600800 }, { "epoch": 35.14651693279523, "grad_norm": 0.28422608971595764, "learning_rate": 0.001, "loss": 1.8452, "step": 600900 }, { "epoch": 35.15236591214833, "grad_norm": 0.13912004232406616, "learning_rate": 0.001, "loss": 1.8435, "step": 601000 }, { "epoch": 35.15821489150143, "grad_norm": 0.24739642441272736, "learning_rate": 0.001, "loss": 1.8447, "step": 601100 }, { "epoch": 35.164063870854534, "grad_norm": 0.19606439769268036, "learning_rate": 0.001, "loss": 1.8449, "step": 601200 }, { "epoch": 35.16991285020764, "grad_norm": 0.1571727842092514, "learning_rate": 0.001, "loss": 1.8481, "step": 601300 }, { "epoch": 35.17576182956074, "grad_norm": 0.15966853499412537, "learning_rate": 0.001, "loss": 1.842, "step": 601400 }, { "epoch": 35.181610808913845, "grad_norm": 0.2263626605272293, "learning_rate": 0.001, "loss": 1.844, "step": 601500 }, { "epoch": 35.18745978826695, "grad_norm": 0.20400647819042206, "learning_rate": 0.001, "loss": 1.8436, "step": 601600 }, { "epoch": 35.19330876762005, "grad_norm": 0.17627735435962677, "learning_rate": 0.001, "loss": 1.8437, "step": 601700 }, { "epoch": 35.19915774697316, "grad_norm": 0.2371336668729782, "learning_rate": 0.001, "loss": 1.8484, "step": 601800 }, { "epoch": 35.20500672632625, "grad_norm": 0.22299809753894806, "learning_rate": 0.001, "loss": 1.8434, "step": 601900 }, { "epoch": 35.21085570567936, "grad_norm": 0.22717981040477753, "learning_rate": 0.001, "loss": 1.8471, "step": 602000 }, { "epoch": 35.21670468503246, "grad_norm": 0.1769733875989914, "learning_rate": 0.001, "loss": 1.8473, "step": 602100 }, { "epoch": 35.222553664385565, "grad_norm": 0.17540086805820465, "learning_rate": 0.001, "loss": 1.8464, "step": 602200 }, { "epoch": 35.22840264373867, "grad_norm": 0.16160671412944794, "learning_rate": 0.001, "loss": 1.84, "step": 602300 }, { "epoch": 35.23425162309177, "grad_norm": 0.22032827138900757, "learning_rate": 0.001, "loss": 1.8438, "step": 602400 }, { "epoch": 35.240100602444876, "grad_norm": 0.1890721321105957, "learning_rate": 0.001, "loss": 1.8441, "step": 602500 }, { "epoch": 35.24594958179797, "grad_norm": 0.17582900822162628, "learning_rate": 0.001, "loss": 1.8405, "step": 602600 }, { "epoch": 35.25179856115108, "grad_norm": 0.18739241361618042, "learning_rate": 0.001, "loss": 1.8575, "step": 602700 }, { "epoch": 35.25764754050418, "grad_norm": 0.18425656855106354, "learning_rate": 0.001, "loss": 1.8481, "step": 602800 }, { "epoch": 35.263496519857284, "grad_norm": 0.1771286129951477, "learning_rate": 0.001, "loss": 1.8494, "step": 602900 }, { "epoch": 35.26934549921039, "grad_norm": 0.16967831552028656, "learning_rate": 0.001, "loss": 1.8479, "step": 603000 }, { "epoch": 35.27519447856349, "grad_norm": 0.1943686604499817, "learning_rate": 0.001, "loss": 1.8491, "step": 603100 }, { "epoch": 35.281043457916596, "grad_norm": 0.1742018163204193, "learning_rate": 0.001, "loss": 1.8428, "step": 603200 }, { "epoch": 35.2868924372697, "grad_norm": 0.16188329458236694, "learning_rate": 0.001, "loss": 1.8485, "step": 603300 }, { "epoch": 35.292741416622796, "grad_norm": 0.18645405769348145, "learning_rate": 0.001, "loss": 1.8463, "step": 603400 }, { "epoch": 35.2985903959759, "grad_norm": 0.20367048680782318, "learning_rate": 0.001, "loss": 1.8503, "step": 603500 }, { "epoch": 35.304439375329004, "grad_norm": 0.22198273241519928, "learning_rate": 0.001, "loss": 1.8504, "step": 603600 }, { "epoch": 35.31028835468211, "grad_norm": 0.20382225513458252, "learning_rate": 0.001, "loss": 1.841, "step": 603700 }, { "epoch": 35.31613733403521, "grad_norm": 0.24345672130584717, "learning_rate": 0.001, "loss": 1.8442, "step": 603800 }, { "epoch": 35.321986313388315, "grad_norm": 0.20624156296253204, "learning_rate": 0.001, "loss": 1.8496, "step": 603900 }, { "epoch": 35.32783529274142, "grad_norm": 0.25829586386680603, "learning_rate": 0.001, "loss": 1.8463, "step": 604000 }, { "epoch": 35.33368427209452, "grad_norm": 0.21903344988822937, "learning_rate": 0.001, "loss": 1.853, "step": 604100 }, { "epoch": 35.33953325144762, "grad_norm": 0.23124156892299652, "learning_rate": 0.001, "loss": 1.8442, "step": 604200 }, { "epoch": 35.34538223080072, "grad_norm": 0.24154452979564667, "learning_rate": 0.001, "loss": 1.8478, "step": 604300 }, { "epoch": 35.35123121015383, "grad_norm": 0.2708202600479126, "learning_rate": 0.001, "loss": 1.8473, "step": 604400 }, { "epoch": 35.35708018950693, "grad_norm": 0.19766250252723694, "learning_rate": 0.001, "loss": 1.8449, "step": 604500 }, { "epoch": 35.362929168860035, "grad_norm": 0.20517182350158691, "learning_rate": 0.001, "loss": 1.8475, "step": 604600 }, { "epoch": 35.36877814821314, "grad_norm": 0.19072328507900238, "learning_rate": 0.001, "loss": 1.8455, "step": 604700 }, { "epoch": 35.37462712756624, "grad_norm": 0.20012958347797394, "learning_rate": 0.001, "loss": 1.85, "step": 604800 }, { "epoch": 35.380476106919346, "grad_norm": 0.20300602912902832, "learning_rate": 0.001, "loss": 1.8447, "step": 604900 }, { "epoch": 35.38632508627244, "grad_norm": 0.22068345546722412, "learning_rate": 0.001, "loss": 1.8494, "step": 605000 }, { "epoch": 35.39217406562555, "grad_norm": 0.18633683025836945, "learning_rate": 0.001, "loss": 1.8498, "step": 605100 }, { "epoch": 35.39802304497865, "grad_norm": 0.23521125316619873, "learning_rate": 0.001, "loss": 1.8505, "step": 605200 }, { "epoch": 35.403872024331754, "grad_norm": 0.19926562905311584, "learning_rate": 0.001, "loss": 1.8467, "step": 605300 }, { "epoch": 35.40972100368486, "grad_norm": 0.1975603848695755, "learning_rate": 0.001, "loss": 1.845, "step": 605400 }, { "epoch": 35.41556998303796, "grad_norm": 0.2527534067630768, "learning_rate": 0.001, "loss": 1.8517, "step": 605500 }, { "epoch": 35.421418962391066, "grad_norm": 0.18168075382709503, "learning_rate": 0.001, "loss": 1.8517, "step": 605600 }, { "epoch": 35.42726794174416, "grad_norm": 0.1972969025373459, "learning_rate": 0.001, "loss": 1.8448, "step": 605700 }, { "epoch": 35.433116921097266, "grad_norm": 0.21338598430156708, "learning_rate": 0.001, "loss": 1.8485, "step": 605800 }, { "epoch": 35.43896590045037, "grad_norm": 0.18188956379890442, "learning_rate": 0.001, "loss": 1.8442, "step": 605900 }, { "epoch": 35.444814879803474, "grad_norm": 0.16925093531608582, "learning_rate": 0.001, "loss": 1.8508, "step": 606000 }, { "epoch": 35.45066385915658, "grad_norm": 0.23560655117034912, "learning_rate": 0.001, "loss": 1.8516, "step": 606100 }, { "epoch": 35.45651283850968, "grad_norm": 0.24863630533218384, "learning_rate": 0.001, "loss": 1.8506, "step": 606200 }, { "epoch": 35.462361817862785, "grad_norm": 0.1583227813243866, "learning_rate": 0.001, "loss": 1.8432, "step": 606300 }, { "epoch": 35.46821079721589, "grad_norm": 0.22374267876148224, "learning_rate": 0.001, "loss": 1.8446, "step": 606400 }, { "epoch": 35.474059776568986, "grad_norm": 0.23149816691875458, "learning_rate": 0.001, "loss": 1.852, "step": 606500 }, { "epoch": 35.47990875592209, "grad_norm": 0.2574117183685303, "learning_rate": 0.001, "loss": 1.8483, "step": 606600 }, { "epoch": 35.48575773527519, "grad_norm": 0.18994073569774628, "learning_rate": 0.001, "loss": 1.8492, "step": 606700 }, { "epoch": 35.4916067146283, "grad_norm": 0.22026462852954865, "learning_rate": 0.001, "loss": 1.8522, "step": 606800 }, { "epoch": 35.4974556939814, "grad_norm": 0.18187403678894043, "learning_rate": 0.001, "loss": 1.8501, "step": 606900 }, { "epoch": 35.503304673334505, "grad_norm": 0.22444166243076324, "learning_rate": 0.001, "loss": 1.8531, "step": 607000 }, { "epoch": 35.50915365268761, "grad_norm": 0.19370798766613007, "learning_rate": 0.001, "loss": 1.8525, "step": 607100 }, { "epoch": 35.51500263204071, "grad_norm": 0.2238507717847824, "learning_rate": 0.001, "loss": 1.8446, "step": 607200 }, { "epoch": 35.52085161139381, "grad_norm": 0.19337758421897888, "learning_rate": 0.001, "loss": 1.852, "step": 607300 }, { "epoch": 35.52670059074691, "grad_norm": 0.19634288549423218, "learning_rate": 0.001, "loss": 1.8521, "step": 607400 }, { "epoch": 35.53254957010002, "grad_norm": 0.16802969574928284, "learning_rate": 0.001, "loss": 1.8497, "step": 607500 }, { "epoch": 35.53839854945312, "grad_norm": 0.26705020666122437, "learning_rate": 0.001, "loss": 1.8483, "step": 607600 }, { "epoch": 35.544247528806224, "grad_norm": 0.2056807428598404, "learning_rate": 0.001, "loss": 1.8507, "step": 607700 }, { "epoch": 35.55009650815933, "grad_norm": 0.20640648901462555, "learning_rate": 0.001, "loss": 1.8491, "step": 607800 }, { "epoch": 35.55594548751243, "grad_norm": 0.1578202098608017, "learning_rate": 0.001, "loss": 1.8506, "step": 607900 }, { "epoch": 35.561794466865535, "grad_norm": 0.2198212593793869, "learning_rate": 0.001, "loss": 1.8497, "step": 608000 }, { "epoch": 35.56764344621863, "grad_norm": 0.16051428020000458, "learning_rate": 0.001, "loss": 1.8504, "step": 608100 }, { "epoch": 35.573492425571736, "grad_norm": 0.20287510752677917, "learning_rate": 0.001, "loss": 1.8546, "step": 608200 }, { "epoch": 35.57934140492484, "grad_norm": 0.2394116371870041, "learning_rate": 0.001, "loss": 1.8542, "step": 608300 }, { "epoch": 35.585190384277944, "grad_norm": 0.20881210267543793, "learning_rate": 0.001, "loss": 1.8504, "step": 608400 }, { "epoch": 35.59103936363105, "grad_norm": 0.2042393684387207, "learning_rate": 0.001, "loss": 1.8526, "step": 608500 }, { "epoch": 35.59688834298415, "grad_norm": 0.21743638813495636, "learning_rate": 0.001, "loss": 1.8481, "step": 608600 }, { "epoch": 35.602737322337255, "grad_norm": 0.19933123886585236, "learning_rate": 0.001, "loss": 1.8446, "step": 608700 }, { "epoch": 35.60858630169035, "grad_norm": 0.24036690592765808, "learning_rate": 0.001, "loss": 1.8529, "step": 608800 }, { "epoch": 35.614435281043455, "grad_norm": 0.1839805543422699, "learning_rate": 0.001, "loss": 1.8477, "step": 608900 }, { "epoch": 35.62028426039656, "grad_norm": 0.16660456359386444, "learning_rate": 0.001, "loss": 1.8449, "step": 609000 }, { "epoch": 35.62613323974966, "grad_norm": 0.20754274725914001, "learning_rate": 0.001, "loss": 1.848, "step": 609100 }, { "epoch": 35.63198221910277, "grad_norm": 0.2186703383922577, "learning_rate": 0.001, "loss": 1.8571, "step": 609200 }, { "epoch": 35.63783119845587, "grad_norm": 0.22441120445728302, "learning_rate": 0.001, "loss": 1.8492, "step": 609300 }, { "epoch": 35.643680177808974, "grad_norm": 0.18450121581554413, "learning_rate": 0.001, "loss": 1.85, "step": 609400 }, { "epoch": 35.64952915716208, "grad_norm": 0.20254750549793243, "learning_rate": 0.001, "loss": 1.8453, "step": 609500 }, { "epoch": 35.655378136515175, "grad_norm": 0.21645112335681915, "learning_rate": 0.001, "loss": 1.8539, "step": 609600 }, { "epoch": 35.66122711586828, "grad_norm": 0.15852996706962585, "learning_rate": 0.001, "loss": 1.8512, "step": 609700 }, { "epoch": 35.66707609522138, "grad_norm": 0.22603574395179749, "learning_rate": 0.001, "loss": 1.8496, "step": 609800 }, { "epoch": 35.672925074574486, "grad_norm": 0.1958751529455185, "learning_rate": 0.001, "loss": 1.8572, "step": 609900 }, { "epoch": 35.67877405392759, "grad_norm": 0.17665033042430878, "learning_rate": 0.001, "loss": 1.8513, "step": 610000 }, { "epoch": 35.684623033280694, "grad_norm": 0.21614643931388855, "learning_rate": 0.001, "loss": 1.8538, "step": 610100 }, { "epoch": 35.6904720126338, "grad_norm": 0.21196305751800537, "learning_rate": 0.001, "loss": 1.8548, "step": 610200 }, { "epoch": 35.6963209919869, "grad_norm": 0.16474302113056183, "learning_rate": 0.001, "loss": 1.8517, "step": 610300 }, { "epoch": 35.70216997134, "grad_norm": 0.18187865614891052, "learning_rate": 0.001, "loss": 1.8511, "step": 610400 }, { "epoch": 35.7080189506931, "grad_norm": 0.18930600583553314, "learning_rate": 0.001, "loss": 1.8493, "step": 610500 }, { "epoch": 35.713867930046206, "grad_norm": 0.22274374961853027, "learning_rate": 0.001, "loss": 1.8551, "step": 610600 }, { "epoch": 35.71971690939931, "grad_norm": 0.20667824149131775, "learning_rate": 0.001, "loss": 1.8534, "step": 610700 }, { "epoch": 35.72556588875241, "grad_norm": 0.15997594594955444, "learning_rate": 0.001, "loss": 1.8487, "step": 610800 }, { "epoch": 35.73141486810552, "grad_norm": 0.18631570041179657, "learning_rate": 0.001, "loss": 1.8504, "step": 610900 }, { "epoch": 35.73726384745862, "grad_norm": 0.15623924136161804, "learning_rate": 0.001, "loss": 1.8525, "step": 611000 }, { "epoch": 35.743112826811725, "grad_norm": 0.20449981093406677, "learning_rate": 0.001, "loss": 1.8498, "step": 611100 }, { "epoch": 35.74896180616482, "grad_norm": 0.21993306279182434, "learning_rate": 0.001, "loss": 1.8549, "step": 611200 }, { "epoch": 35.754810785517925, "grad_norm": 0.19150780141353607, "learning_rate": 0.001, "loss": 1.8542, "step": 611300 }, { "epoch": 35.76065976487103, "grad_norm": 0.1656360626220703, "learning_rate": 0.001, "loss": 1.8536, "step": 611400 }, { "epoch": 35.76650874422413, "grad_norm": 0.16744136810302734, "learning_rate": 0.001, "loss": 1.8481, "step": 611500 }, { "epoch": 35.77235772357724, "grad_norm": 0.23247577250003815, "learning_rate": 0.001, "loss": 1.8491, "step": 611600 }, { "epoch": 35.77820670293034, "grad_norm": 0.23936229944229126, "learning_rate": 0.001, "loss": 1.8561, "step": 611700 }, { "epoch": 35.784055682283444, "grad_norm": 0.20229652523994446, "learning_rate": 0.001, "loss": 1.8466, "step": 611800 }, { "epoch": 35.78990466163654, "grad_norm": 0.21864403784275055, "learning_rate": 0.001, "loss": 1.8502, "step": 611900 }, { "epoch": 35.795753640989645, "grad_norm": 0.1901605725288391, "learning_rate": 0.001, "loss": 1.8527, "step": 612000 }, { "epoch": 35.80160262034275, "grad_norm": 0.20508186519145966, "learning_rate": 0.001, "loss": 1.8545, "step": 612100 }, { "epoch": 35.80745159969585, "grad_norm": 0.18075385689735413, "learning_rate": 0.001, "loss": 1.8523, "step": 612200 }, { "epoch": 35.813300579048956, "grad_norm": 0.2639322578907013, "learning_rate": 0.001, "loss": 1.8498, "step": 612300 }, { "epoch": 35.81914955840206, "grad_norm": 0.21843796968460083, "learning_rate": 0.001, "loss": 1.8512, "step": 612400 }, { "epoch": 35.824998537755164, "grad_norm": 0.21152564883232117, "learning_rate": 0.001, "loss": 1.8521, "step": 612500 }, { "epoch": 35.83084751710827, "grad_norm": 0.1680341362953186, "learning_rate": 0.001, "loss": 1.8515, "step": 612600 }, { "epoch": 35.836696496461364, "grad_norm": 0.20731930434703827, "learning_rate": 0.001, "loss": 1.8551, "step": 612700 }, { "epoch": 35.84254547581447, "grad_norm": 0.1894221156835556, "learning_rate": 0.001, "loss": 1.8537, "step": 612800 }, { "epoch": 35.84839445516757, "grad_norm": 0.2027530074119568, "learning_rate": 0.001, "loss": 1.8453, "step": 612900 }, { "epoch": 35.854243434520676, "grad_norm": 0.1848914921283722, "learning_rate": 0.001, "loss": 1.8505, "step": 613000 }, { "epoch": 35.86009241387378, "grad_norm": 0.19472043216228485, "learning_rate": 0.001, "loss": 1.849, "step": 613100 }, { "epoch": 35.86594139322688, "grad_norm": 0.1739959567785263, "learning_rate": 0.001, "loss": 1.8517, "step": 613200 }, { "epoch": 35.87179037257999, "grad_norm": 0.2274477481842041, "learning_rate": 0.001, "loss": 1.8538, "step": 613300 }, { "epoch": 35.87763935193309, "grad_norm": 0.1528811901807785, "learning_rate": 0.001, "loss": 1.8546, "step": 613400 }, { "epoch": 35.88348833128619, "grad_norm": 0.20379063487052917, "learning_rate": 0.001, "loss": 1.8507, "step": 613500 }, { "epoch": 35.88933731063929, "grad_norm": 0.20322920382022858, "learning_rate": 0.001, "loss": 1.8534, "step": 613600 }, { "epoch": 35.895186289992395, "grad_norm": 0.15970419347286224, "learning_rate": 0.001, "loss": 1.8536, "step": 613700 }, { "epoch": 35.9010352693455, "grad_norm": 0.20924319326877594, "learning_rate": 0.001, "loss": 1.8564, "step": 613800 }, { "epoch": 35.9068842486986, "grad_norm": 0.16961701214313507, "learning_rate": 0.001, "loss": 1.8471, "step": 613900 }, { "epoch": 35.91273322805171, "grad_norm": 0.21879278123378754, "learning_rate": 0.001, "loss": 1.8529, "step": 614000 }, { "epoch": 35.91858220740481, "grad_norm": 0.23767611384391785, "learning_rate": 0.001, "loss": 1.8504, "step": 614100 }, { "epoch": 35.924431186757914, "grad_norm": 0.23100240528583527, "learning_rate": 0.001, "loss": 1.8495, "step": 614200 }, { "epoch": 35.93028016611101, "grad_norm": 0.1787976175546646, "learning_rate": 0.001, "loss": 1.8495, "step": 614300 }, { "epoch": 35.936129145464115, "grad_norm": 0.19698947668075562, "learning_rate": 0.001, "loss": 1.8527, "step": 614400 }, { "epoch": 35.94197812481722, "grad_norm": 0.19839435815811157, "learning_rate": 0.001, "loss": 1.8475, "step": 614500 }, { "epoch": 35.94782710417032, "grad_norm": 0.2061050683259964, "learning_rate": 0.001, "loss": 1.8534, "step": 614600 }, { "epoch": 35.953676083523426, "grad_norm": 0.175352543592453, "learning_rate": 0.001, "loss": 1.8486, "step": 614700 }, { "epoch": 35.95952506287653, "grad_norm": 0.1727651208639145, "learning_rate": 0.001, "loss": 1.8569, "step": 614800 }, { "epoch": 35.965374042229634, "grad_norm": 0.21341046690940857, "learning_rate": 0.001, "loss": 1.8529, "step": 614900 }, { "epoch": 35.97122302158273, "grad_norm": 0.19559337198734283, "learning_rate": 0.001, "loss": 1.8495, "step": 615000 }, { "epoch": 35.977072000935834, "grad_norm": 0.1734468787908554, "learning_rate": 0.001, "loss": 1.8521, "step": 615100 }, { "epoch": 35.98292098028894, "grad_norm": 0.21057650446891785, "learning_rate": 0.001, "loss": 1.855, "step": 615200 }, { "epoch": 35.98876995964204, "grad_norm": 0.17788809537887573, "learning_rate": 0.001, "loss": 1.8536, "step": 615300 }, { "epoch": 35.994618938995146, "grad_norm": 0.21178707480430603, "learning_rate": 0.001, "loss": 1.8531, "step": 615400 }, { "epoch": 36.00046791834825, "grad_norm": 0.24890415370464325, "learning_rate": 0.001, "loss": 1.8569, "step": 615500 }, { "epoch": 36.00631689770135, "grad_norm": 0.22124065458774567, "learning_rate": 0.001, "loss": 1.8401, "step": 615600 }, { "epoch": 36.01216587705446, "grad_norm": 0.23819170892238617, "learning_rate": 0.001, "loss": 1.8352, "step": 615700 }, { "epoch": 36.018014856407554, "grad_norm": 0.20598198473453522, "learning_rate": 0.001, "loss": 1.8351, "step": 615800 }, { "epoch": 36.02386383576066, "grad_norm": 0.21152730286121368, "learning_rate": 0.001, "loss": 1.8365, "step": 615900 }, { "epoch": 36.02971281511376, "grad_norm": 0.17813852429389954, "learning_rate": 0.001, "loss": 1.8434, "step": 616000 }, { "epoch": 36.035561794466865, "grad_norm": 0.17403823137283325, "learning_rate": 0.001, "loss": 1.8393, "step": 616100 }, { "epoch": 36.04141077381997, "grad_norm": 0.150756374001503, "learning_rate": 0.001, "loss": 1.8426, "step": 616200 }, { "epoch": 36.04725975317307, "grad_norm": 0.2378218024969101, "learning_rate": 0.001, "loss": 1.8429, "step": 616300 }, { "epoch": 36.05310873252618, "grad_norm": 0.21709619462490082, "learning_rate": 0.001, "loss": 1.8353, "step": 616400 }, { "epoch": 36.05895771187928, "grad_norm": 0.20821312069892883, "learning_rate": 0.001, "loss": 1.8329, "step": 616500 }, { "epoch": 36.06480669123238, "grad_norm": 0.22413526475429535, "learning_rate": 0.001, "loss": 1.8421, "step": 616600 }, { "epoch": 36.07065567058548, "grad_norm": 0.244819775223732, "learning_rate": 0.001, "loss": 1.845, "step": 616700 }, { "epoch": 36.076504649938585, "grad_norm": 0.21584822237491608, "learning_rate": 0.001, "loss": 1.8382, "step": 616800 }, { "epoch": 36.08235362929169, "grad_norm": 0.18839220702648163, "learning_rate": 0.001, "loss": 1.8466, "step": 616900 }, { "epoch": 36.08820260864479, "grad_norm": 0.17195510864257812, "learning_rate": 0.001, "loss": 1.8381, "step": 617000 }, { "epoch": 36.094051587997896, "grad_norm": 0.21728213131427765, "learning_rate": 0.001, "loss": 1.8382, "step": 617100 }, { "epoch": 36.099900567351, "grad_norm": 0.14341384172439575, "learning_rate": 0.001, "loss": 1.8406, "step": 617200 }, { "epoch": 36.105749546704104, "grad_norm": 0.16209104657173157, "learning_rate": 0.001, "loss": 1.8387, "step": 617300 }, { "epoch": 36.1115985260572, "grad_norm": 0.2075880914926529, "learning_rate": 0.001, "loss": 1.8361, "step": 617400 }, { "epoch": 36.117447505410304, "grad_norm": 0.1932358741760254, "learning_rate": 0.001, "loss": 1.8341, "step": 617500 }, { "epoch": 36.12329648476341, "grad_norm": 0.2479132115840912, "learning_rate": 0.001, "loss": 1.8476, "step": 617600 }, { "epoch": 36.12914546411651, "grad_norm": 0.2162671685218811, "learning_rate": 0.001, "loss": 1.8423, "step": 617700 }, { "epoch": 36.134994443469616, "grad_norm": 0.2036142349243164, "learning_rate": 0.001, "loss": 1.8377, "step": 617800 }, { "epoch": 36.14084342282272, "grad_norm": 0.15680694580078125, "learning_rate": 0.001, "loss": 1.8415, "step": 617900 }, { "epoch": 36.14669240217582, "grad_norm": 0.21134115755558014, "learning_rate": 0.001, "loss": 1.8409, "step": 618000 }, { "epoch": 36.15254138152892, "grad_norm": 0.19764330983161926, "learning_rate": 0.001, "loss": 1.8439, "step": 618100 }, { "epoch": 36.158390360882024, "grad_norm": 0.1948583424091339, "learning_rate": 0.001, "loss": 1.8409, "step": 618200 }, { "epoch": 36.16423934023513, "grad_norm": 0.17318032681941986, "learning_rate": 0.001, "loss": 1.8387, "step": 618300 }, { "epoch": 36.17008831958823, "grad_norm": 0.1921614408493042, "learning_rate": 0.001, "loss": 1.8458, "step": 618400 }, { "epoch": 36.175937298941335, "grad_norm": 0.1774672418832779, "learning_rate": 0.001, "loss": 1.8419, "step": 618500 }, { "epoch": 36.18178627829444, "grad_norm": 0.19026246666908264, "learning_rate": 0.001, "loss": 1.8417, "step": 618600 }, { "epoch": 36.18763525764754, "grad_norm": 0.20149314403533936, "learning_rate": 0.001, "loss": 1.8424, "step": 618700 }, { "epoch": 36.193484237000646, "grad_norm": 0.23433798551559448, "learning_rate": 0.001, "loss": 1.8373, "step": 618800 }, { "epoch": 36.19933321635374, "grad_norm": 0.20684818923473358, "learning_rate": 0.001, "loss": 1.8432, "step": 618900 }, { "epoch": 36.20518219570685, "grad_norm": 0.2165238857269287, "learning_rate": 0.001, "loss": 1.8416, "step": 619000 }, { "epoch": 36.21103117505995, "grad_norm": 0.24058926105499268, "learning_rate": 0.001, "loss": 1.8386, "step": 619100 }, { "epoch": 36.216880154413055, "grad_norm": 0.27012312412261963, "learning_rate": 0.001, "loss": 1.8403, "step": 619200 }, { "epoch": 36.22272913376616, "grad_norm": 0.21553444862365723, "learning_rate": 0.001, "loss": 1.8436, "step": 619300 }, { "epoch": 36.22857811311926, "grad_norm": 0.20279888808727264, "learning_rate": 0.001, "loss": 1.8402, "step": 619400 }, { "epoch": 36.234427092472366, "grad_norm": 0.1599474996328354, "learning_rate": 0.001, "loss": 1.8408, "step": 619500 }, { "epoch": 36.24027607182547, "grad_norm": 0.18435515463352203, "learning_rate": 0.001, "loss": 1.8442, "step": 619600 }, { "epoch": 36.246125051178566, "grad_norm": 0.18137463927268982, "learning_rate": 0.001, "loss": 1.8389, "step": 619700 }, { "epoch": 36.25197403053167, "grad_norm": 0.1721472144126892, "learning_rate": 0.001, "loss": 1.845, "step": 619800 }, { "epoch": 36.257823009884774, "grad_norm": 0.19749818742275238, "learning_rate": 0.001, "loss": 1.8506, "step": 619900 }, { "epoch": 36.26367198923788, "grad_norm": 0.1977318674325943, "learning_rate": 0.001, "loss": 1.8475, "step": 620000 }, { "epoch": 36.26952096859098, "grad_norm": 0.17386579513549805, "learning_rate": 0.001, "loss": 1.8403, "step": 620100 }, { "epoch": 36.275369947944085, "grad_norm": 0.20175744593143463, "learning_rate": 0.001, "loss": 1.8405, "step": 620200 }, { "epoch": 36.28121892729719, "grad_norm": 0.19818101823329926, "learning_rate": 0.001, "loss": 1.8438, "step": 620300 }, { "epoch": 36.28706790665029, "grad_norm": 0.18241220712661743, "learning_rate": 0.001, "loss": 1.842, "step": 620400 }, { "epoch": 36.29291688600339, "grad_norm": 0.19019818305969238, "learning_rate": 0.001, "loss": 1.8516, "step": 620500 }, { "epoch": 36.29876586535649, "grad_norm": 0.18447162210941315, "learning_rate": 0.001, "loss": 1.8413, "step": 620600 }, { "epoch": 36.3046148447096, "grad_norm": 0.23467250168323517, "learning_rate": 0.001, "loss": 1.8456, "step": 620700 }, { "epoch": 36.3104638240627, "grad_norm": 0.2361362725496292, "learning_rate": 0.001, "loss": 1.8454, "step": 620800 }, { "epoch": 36.316312803415805, "grad_norm": 0.19939424097537994, "learning_rate": 0.001, "loss": 1.8492, "step": 620900 }, { "epoch": 36.32216178276891, "grad_norm": 0.1998232901096344, "learning_rate": 0.001, "loss": 1.8412, "step": 621000 }, { "epoch": 36.32801076212201, "grad_norm": 0.19854187965393066, "learning_rate": 0.001, "loss": 1.841, "step": 621100 }, { "epoch": 36.33385974147511, "grad_norm": 0.2549595236778259, "learning_rate": 0.001, "loss": 1.8412, "step": 621200 }, { "epoch": 36.33970872082821, "grad_norm": 0.20019511878490448, "learning_rate": 0.001, "loss": 1.8449, "step": 621300 }, { "epoch": 36.34555770018132, "grad_norm": 0.18120773136615753, "learning_rate": 0.001, "loss": 1.8426, "step": 621400 }, { "epoch": 36.35140667953442, "grad_norm": 0.24543873965740204, "learning_rate": 0.001, "loss": 1.8437, "step": 621500 }, { "epoch": 36.357255658887524, "grad_norm": 0.16388174891471863, "learning_rate": 0.001, "loss": 1.8441, "step": 621600 }, { "epoch": 36.36310463824063, "grad_norm": 0.16767586767673492, "learning_rate": 0.001, "loss": 1.8379, "step": 621700 }, { "epoch": 36.36895361759373, "grad_norm": 0.18620741367340088, "learning_rate": 0.001, "loss": 1.8459, "step": 621800 }, { "epoch": 36.374802596946836, "grad_norm": 0.1992681324481964, "learning_rate": 0.001, "loss": 1.849, "step": 621900 }, { "epoch": 36.38065157629993, "grad_norm": 0.18923532962799072, "learning_rate": 0.001, "loss": 1.8478, "step": 622000 }, { "epoch": 36.386500555653036, "grad_norm": 0.18788912892341614, "learning_rate": 0.001, "loss": 1.8438, "step": 622100 }, { "epoch": 36.39234953500614, "grad_norm": 0.18265023827552795, "learning_rate": 0.001, "loss": 1.8499, "step": 622200 }, { "epoch": 36.398198514359244, "grad_norm": 0.1725882887840271, "learning_rate": 0.001, "loss": 1.8453, "step": 622300 }, { "epoch": 36.40404749371235, "grad_norm": 0.20871998369693756, "learning_rate": 0.001, "loss": 1.8528, "step": 622400 }, { "epoch": 36.40989647306545, "grad_norm": 0.18228697776794434, "learning_rate": 0.001, "loss": 1.8442, "step": 622500 }, { "epoch": 36.415745452418555, "grad_norm": 0.19889859855175018, "learning_rate": 0.001, "loss": 1.8455, "step": 622600 }, { "epoch": 36.42159443177166, "grad_norm": 0.22466957569122314, "learning_rate": 0.001, "loss": 1.8439, "step": 622700 }, { "epoch": 36.427443411124756, "grad_norm": 0.1940121203660965, "learning_rate": 0.001, "loss": 1.839, "step": 622800 }, { "epoch": 36.43329239047786, "grad_norm": 0.25477954745292664, "learning_rate": 0.001, "loss": 1.8462, "step": 622900 }, { "epoch": 36.43914136983096, "grad_norm": 0.2168007344007492, "learning_rate": 0.001, "loss": 1.8434, "step": 623000 }, { "epoch": 36.44499034918407, "grad_norm": 0.16983367502689362, "learning_rate": 0.001, "loss": 1.8393, "step": 623100 }, { "epoch": 36.45083932853717, "grad_norm": 0.20215243101119995, "learning_rate": 0.001, "loss": 1.8496, "step": 623200 }, { "epoch": 36.456688307890275, "grad_norm": 0.1711164265871048, "learning_rate": 0.001, "loss": 1.8478, "step": 623300 }, { "epoch": 36.46253728724338, "grad_norm": 0.20272670686244965, "learning_rate": 0.001, "loss": 1.8494, "step": 623400 }, { "epoch": 36.46838626659648, "grad_norm": 0.1719396710395813, "learning_rate": 0.001, "loss": 1.8444, "step": 623500 }, { "epoch": 36.47423524594958, "grad_norm": 0.1706494688987732, "learning_rate": 0.001, "loss": 1.8445, "step": 623600 }, { "epoch": 36.48008422530268, "grad_norm": 0.21377551555633545, "learning_rate": 0.001, "loss": 1.8478, "step": 623700 }, { "epoch": 36.48593320465579, "grad_norm": 0.18905766308307648, "learning_rate": 0.001, "loss": 1.8515, "step": 623800 }, { "epoch": 36.49178218400889, "grad_norm": 0.20486441254615784, "learning_rate": 0.001, "loss": 1.847, "step": 623900 }, { "epoch": 36.497631163361994, "grad_norm": 0.19798964262008667, "learning_rate": 0.001, "loss": 1.8465, "step": 624000 }, { "epoch": 36.5034801427151, "grad_norm": 0.19030149281024933, "learning_rate": 0.001, "loss": 1.8438, "step": 624100 }, { "epoch": 36.5093291220682, "grad_norm": 0.22747330367565155, "learning_rate": 0.001, "loss": 1.8488, "step": 624200 }, { "epoch": 36.5151781014213, "grad_norm": 0.2399371713399887, "learning_rate": 0.001, "loss": 1.8445, "step": 624300 }, { "epoch": 36.5210270807744, "grad_norm": 0.16390447318553925, "learning_rate": 0.001, "loss": 1.8436, "step": 624400 }, { "epoch": 36.526876060127506, "grad_norm": 0.19478319585323334, "learning_rate": 0.001, "loss": 1.8445, "step": 624500 }, { "epoch": 36.53272503948061, "grad_norm": 0.21451519429683685, "learning_rate": 0.001, "loss": 1.8522, "step": 624600 }, { "epoch": 36.538574018833714, "grad_norm": 0.17581886053085327, "learning_rate": 0.001, "loss": 1.8467, "step": 624700 }, { "epoch": 36.54442299818682, "grad_norm": 0.2254980206489563, "learning_rate": 0.001, "loss": 1.8457, "step": 624800 }, { "epoch": 36.55027197753992, "grad_norm": 0.19213779270648956, "learning_rate": 0.001, "loss": 1.8461, "step": 624900 }, { "epoch": 36.556120956893025, "grad_norm": 0.12433925271034241, "learning_rate": 0.001, "loss": 1.8412, "step": 625000 }, { "epoch": 36.556120956893025, "eval_ag_news_accuracy": 0.241015625, "eval_ag_news_bleu_score": 8.086556599134614, "eval_ag_news_bleu_score_sem": 0.5980317568050795, "eval_ag_news_emb_cos_sim": 0.7227737307548523, "eval_ag_news_emb_cos_sim_sem": 0.01353839598596096, "eval_ag_news_emb_top1_equal": 0.9609375, "eval_ag_news_emb_top1_equal_sem": 0.017191974446177483, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.740386724472046, "eval_ag_news_n_ngrams_match_1": 14.1484375, "eval_ag_news_n_ngrams_match_2": 4.421875, "eval_ag_news_n_ngrams_match_3": 1.8515625, "eval_ag_news_num_pred_words": 45.03125, "eval_ag_news_num_true_words": 43.984375, "eval_ag_news_perplexity": 15.492975450708556, "eval_ag_news_pred_num_tokens": 68.03125, "eval_ag_news_rouge_score": 0.3181019686988128, "eval_ag_news_runtime": 36.7415, "eval_ag_news_samples_per_second": 13.609, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3532676842743384, "eval_ag_news_token_set_f1_sem": 0.010190188675926888, "eval_ag_news_token_set_precision": 0.3320512085264629, "eval_ag_news_token_set_recall": 0.388524813799282, "eval_ag_news_true_num_tokens": 60.671875, "step": 625000 }, { "epoch": 36.556120956893025, "eval_anthropic_toxic_prompts_accuracy": 0.10303125, "eval_anthropic_toxic_prompts_bleu_score": 43.39447563889041, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.708630229177885, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8797879815101624, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011585860513150692, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1484375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0315484639796987, "eval_anthropic_toxic_prompts_loss": 1.2814868688583374, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.453125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.796875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.828125, "eval_anthropic_toxic_prompts_num_pred_words": 15.890625, "eval_anthropic_toxic_prompts_num_true_words": 15.2890625, "eval_anthropic_toxic_prompts_perplexity": 3.6019914348308557, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.1015625, "eval_anthropic_toxic_prompts_rouge_score": 0.6846391983366967, "eval_anthropic_toxic_prompts_runtime": 28.4815, "eval_anthropic_toxic_prompts_samples_per_second": 17.555, "eval_anthropic_toxic_prompts_steps_per_second": 0.035, "eval_anthropic_toxic_prompts_token_set_f1": 0.7002520351895087, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018783097288295997, "eval_anthropic_toxic_prompts_token_set_precision": 0.6892576198860282, "eval_anthropic_toxic_prompts_token_set_recall": 0.7179005993701458, "eval_anthropic_toxic_prompts_true_num_tokens": 18.6484375, "step": 625000 }, { "epoch": 36.556120956893025, "eval_arxiv_accuracy": 0.37621875, "eval_arxiv_bleu_score": 1.7229786885936427, "eval_arxiv_bleu_score_sem": 0.1494279195079929, "eval_arxiv_emb_cos_sim": 0.468191921710968, "eval_arxiv_emb_cos_sim_sem": 0.019831178709864616, "eval_arxiv_emb_top1_equal": 0.890625, "eval_arxiv_emb_top1_equal_sem": 0.02769520878791809, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4190609455108643, "eval_arxiv_n_ngrams_match_1": 13.5859375, "eval_arxiv_n_ngrams_match_2": 2.4453125, "eval_arxiv_n_ngrams_match_3": 0.5234375, "eval_arxiv_num_pred_words": 54.7265625, "eval_arxiv_num_true_words": 86.453125, "eval_arxiv_perplexity": 30.54072214883817, "eval_arxiv_pred_num_tokens": 125.1640625, "eval_arxiv_rouge_score": 0.17991190958216802, "eval_arxiv_runtime": 29.8302, "eval_arxiv_samples_per_second": 16.762, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.18329845380835263, "eval_arxiv_token_set_f1_sem": 0.008483906690651237, "eval_arxiv_token_set_precision": 0.12525649236382852, "eval_arxiv_token_set_recall": 0.42488507249801666, "eval_arxiv_true_num_tokens": 124.6875, "step": 625000 }, { "epoch": 36.556120956893025, "eval_python_code_alpaca_accuracy": 0.1258125, "eval_python_code_alpaca_bleu_score": 26.008176549890877, "eval_python_code_alpaca_bleu_score_sem": 1.6058581306380382, "eval_python_code_alpaca_emb_cos_sim": 0.8589215278625488, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011072871275246143, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.467571496963501, "eval_python_code_alpaca_n_ngrams_match_1": 10.296875, "eval_python_code_alpaca_n_ngrams_match_2": 5.5625, "eval_python_code_alpaca_n_ngrams_match_3": 3.15625, "eval_python_code_alpaca_num_pred_words": 17.6953125, "eval_python_code_alpaca_num_true_words": 19.359375, "eval_python_code_alpaca_perplexity": 4.338685825023899, "eval_python_code_alpaca_pred_num_tokens": 23.265625, "eval_python_code_alpaca_rouge_score": 0.5708721510825856, "eval_python_code_alpaca_runtime": 29.1393, "eval_python_code_alpaca_samples_per_second": 17.159, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.5913043444961031, "eval_python_code_alpaca_token_set_f1_sem": 0.013427812319327546, "eval_python_code_alpaca_token_set_precision": 0.5714813568852365, "eval_python_code_alpaca_token_set_recall": 0.6192734064165675, "eval_python_code_alpaca_true_num_tokens": 24.5390625, "step": 625000 }, { "epoch": 36.556120956893025, "eval_wikibio_accuracy": 0.3635625, "eval_wikibio_bleu_score": 8.148195013456137, "eval_wikibio_bleu_score_sem": 0.7686706818696083, "eval_wikibio_emb_cos_sim": 0.636623740196228, "eval_wikibio_emb_cos_sim_sem": 0.021511094644665718, "eval_wikibio_emb_top1_equal": 0.96875, "eval_wikibio_emb_top1_equal_sem": 0.01543935015797615, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.711331605911255, "eval_wikibio_n_ngrams_match_1": 14.6328125, "eval_wikibio_n_ngrams_match_2": 5.0546875, "eval_wikibio_n_ngrams_match_3": 2.1796875, "eval_wikibio_num_pred_words": 48.234375, "eval_wikibio_num_true_words": 50.734375, "eval_wikibio_perplexity": 15.049301917671869, "eval_wikibio_pred_num_tokens": 101.09375, "eval_wikibio_rouge_score": 0.3023261174142039, "eval_wikibio_runtime": 30.0277, "eval_wikibio_samples_per_second": 16.651, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.32519548809323173, "eval_wikibio_token_set_f1_sem": 0.013035149011135267, "eval_wikibio_token_set_precision": 0.28861294050247205, "eval_wikibio_token_set_recall": 0.4239523151713917, "eval_wikibio_true_num_tokens": 98.046875, "step": 625000 }, { "epoch": 36.556120956893025, "eval_msmarco_accuracy": 0.393078125, "eval_msmarco_bleu_score": 17.873850743159736, "eval_msmarco_bleu_score_sem": 1.349397143333645, "eval_msmarco_emb_cos_sim": 0.7903698682785034, "eval_msmarco_emb_cos_sim_sem": 0.01579904370009899, "eval_msmarco_emb_top1_equal": 0.9375, "eval_msmarco_emb_top1_equal_sem": 0.02147948183119297, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7895334959030151, "eval_msmarco_n_ngrams_match_1": 28.7890625, "eval_msmarco_n_ngrams_match_2": 13.46875, "eval_msmarco_n_ngrams_match_3": 7.8125, "eval_msmarco_num_pred_words": 62.7890625, "eval_msmarco_num_true_words": 62.140625, "eval_msmarco_perplexity": 5.9866590138980245, "eval_msmarco_pred_num_tokens": 85.7109375, "eval_msmarco_rouge_score": 0.4420756274983825, "eval_msmarco_runtime": 25.4931, "eval_msmarco_samples_per_second": 19.613, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.47915977628597456, "eval_msmarco_token_set_f1_sem": 0.013517293020671103, "eval_msmarco_token_set_precision": 0.4417547539920227, "eval_msmarco_token_set_recall": 0.5479770519784808, "eval_msmarco_true_num_tokens": 81.3984375, "step": 625000 }, { "epoch": 36.56196993624612, "grad_norm": 0.18038305640220642, "learning_rate": 0.001, "loss": 1.8455, "step": 625100 }, { "epoch": 36.567818915599226, "grad_norm": 0.23727348446846008, "learning_rate": 0.001, "loss": 1.846, "step": 625200 }, { "epoch": 36.57366789495233, "grad_norm": 0.18653982877731323, "learning_rate": 0.001, "loss": 1.8543, "step": 625300 }, { "epoch": 36.57951687430543, "grad_norm": 0.1689344197511673, "learning_rate": 0.001, "loss": 1.8464, "step": 625400 }, { "epoch": 36.58536585365854, "grad_norm": 0.1693335771560669, "learning_rate": 0.001, "loss": 1.8458, "step": 625500 }, { "epoch": 36.59121483301164, "grad_norm": 0.1664956957101822, "learning_rate": 0.001, "loss": 1.8489, "step": 625600 }, { "epoch": 36.597063812364745, "grad_norm": 0.2011990249156952, "learning_rate": 0.001, "loss": 1.8418, "step": 625700 }, { "epoch": 36.60291279171785, "grad_norm": 0.23103870451450348, "learning_rate": 0.001, "loss": 1.8471, "step": 625800 }, { "epoch": 36.608761771070945, "grad_norm": 0.22659499943256378, "learning_rate": 0.001, "loss": 1.8464, "step": 625900 }, { "epoch": 36.61461075042405, "grad_norm": 0.2340521663427353, "learning_rate": 0.001, "loss": 1.8503, "step": 626000 }, { "epoch": 36.62045972977715, "grad_norm": 0.16727010905742645, "learning_rate": 0.001, "loss": 1.8495, "step": 626100 }, { "epoch": 36.62630870913026, "grad_norm": 0.17731694877147675, "learning_rate": 0.001, "loss": 1.8442, "step": 626200 }, { "epoch": 36.63215768848336, "grad_norm": 0.17644637823104858, "learning_rate": 0.001, "loss": 1.8428, "step": 626300 }, { "epoch": 36.638006667836464, "grad_norm": 0.19046032428741455, "learning_rate": 0.001, "loss": 1.8391, "step": 626400 }, { "epoch": 36.64385564718957, "grad_norm": 0.18092520534992218, "learning_rate": 0.001, "loss": 1.8471, "step": 626500 }, { "epoch": 36.64970462654267, "grad_norm": 0.17449775338172913, "learning_rate": 0.001, "loss": 1.8496, "step": 626600 }, { "epoch": 36.65555360589577, "grad_norm": 0.20355042815208435, "learning_rate": 0.001, "loss": 1.8454, "step": 626700 }, { "epoch": 36.66140258524887, "grad_norm": 0.17105698585510254, "learning_rate": 0.001, "loss": 1.8448, "step": 626800 }, { "epoch": 36.667251564601976, "grad_norm": 0.2412824034690857, "learning_rate": 0.001, "loss": 1.858, "step": 626900 }, { "epoch": 36.67310054395508, "grad_norm": 0.16429263353347778, "learning_rate": 0.001, "loss": 1.8467, "step": 627000 }, { "epoch": 36.678949523308184, "grad_norm": 0.3935217559337616, "learning_rate": 0.001, "loss": 1.8448, "step": 627100 }, { "epoch": 36.68479850266129, "grad_norm": 0.18425798416137695, "learning_rate": 0.001, "loss": 1.853, "step": 627200 }, { "epoch": 36.69064748201439, "grad_norm": 0.20897476375102997, "learning_rate": 0.001, "loss": 1.8439, "step": 627300 }, { "epoch": 36.69649646136749, "grad_norm": 0.21998675167560577, "learning_rate": 0.001, "loss": 1.847, "step": 627400 }, { "epoch": 36.70234544072059, "grad_norm": 0.18067383766174316, "learning_rate": 0.001, "loss": 1.8471, "step": 627500 }, { "epoch": 36.708194420073696, "grad_norm": 0.2319432646036148, "learning_rate": 0.001, "loss": 1.8511, "step": 627600 }, { "epoch": 36.7140433994268, "grad_norm": 0.16931182146072388, "learning_rate": 0.001, "loss": 1.8467, "step": 627700 }, { "epoch": 36.7198923787799, "grad_norm": 0.2067161202430725, "learning_rate": 0.001, "loss": 1.8504, "step": 627800 }, { "epoch": 36.72574135813301, "grad_norm": 0.26108816266059875, "learning_rate": 0.001, "loss": 1.8456, "step": 627900 }, { "epoch": 36.73159033748611, "grad_norm": 0.2300434112548828, "learning_rate": 0.001, "loss": 1.8448, "step": 628000 }, { "epoch": 36.737439316839215, "grad_norm": 0.22500373423099518, "learning_rate": 0.001, "loss": 1.8514, "step": 628100 }, { "epoch": 36.74328829619231, "grad_norm": 0.2604440450668335, "learning_rate": 0.001, "loss": 1.8522, "step": 628200 }, { "epoch": 36.749137275545415, "grad_norm": 0.2473919689655304, "learning_rate": 0.001, "loss": 1.8483, "step": 628300 }, { "epoch": 36.75498625489852, "grad_norm": 0.15241114795207977, "learning_rate": 0.001, "loss": 1.8464, "step": 628400 }, { "epoch": 36.76083523425162, "grad_norm": 0.22520634531974792, "learning_rate": 0.001, "loss": 1.8507, "step": 628500 }, { "epoch": 36.76668421360473, "grad_norm": 0.2402704805135727, "learning_rate": 0.001, "loss": 1.8496, "step": 628600 }, { "epoch": 36.77253319295783, "grad_norm": 0.2239837348461151, "learning_rate": 0.001, "loss": 1.8456, "step": 628700 }, { "epoch": 36.778382172310934, "grad_norm": 0.21830560266971588, "learning_rate": 0.001, "loss": 1.8463, "step": 628800 }, { "epoch": 36.78423115166404, "grad_norm": 0.18464455008506775, "learning_rate": 0.001, "loss": 1.8439, "step": 628900 }, { "epoch": 36.790080131017135, "grad_norm": 0.16892407834529877, "learning_rate": 0.001, "loss": 1.8532, "step": 629000 }, { "epoch": 36.79592911037024, "grad_norm": 0.19905155897140503, "learning_rate": 0.001, "loss": 1.8431, "step": 629100 }, { "epoch": 36.80177808972334, "grad_norm": 0.2011309713125229, "learning_rate": 0.001, "loss": 1.8507, "step": 629200 }, { "epoch": 36.807627069076446, "grad_norm": 0.15519049763679504, "learning_rate": 0.001, "loss": 1.8465, "step": 629300 }, { "epoch": 36.81347604842955, "grad_norm": 0.18231859803199768, "learning_rate": 0.001, "loss": 1.8462, "step": 629400 }, { "epoch": 36.819325027782654, "grad_norm": 0.250644713640213, "learning_rate": 0.001, "loss": 1.8512, "step": 629500 }, { "epoch": 36.82517400713576, "grad_norm": 0.169747993350029, "learning_rate": 0.001, "loss": 1.8471, "step": 629600 }, { "epoch": 36.83102298648886, "grad_norm": 0.20056027173995972, "learning_rate": 0.001, "loss": 1.852, "step": 629700 }, { "epoch": 36.83687196584196, "grad_norm": 0.20251628756523132, "learning_rate": 0.001, "loss": 1.8491, "step": 629800 }, { "epoch": 36.84272094519506, "grad_norm": 0.2092164009809494, "learning_rate": 0.001, "loss": 1.8486, "step": 629900 }, { "epoch": 36.848569924548165, "grad_norm": 0.18370075523853302, "learning_rate": 0.001, "loss": 1.8488, "step": 630000 }, { "epoch": 36.85441890390127, "grad_norm": 0.15486058592796326, "learning_rate": 0.001, "loss": 1.8469, "step": 630100 }, { "epoch": 36.86026788325437, "grad_norm": 0.18193404376506805, "learning_rate": 0.001, "loss": 1.843, "step": 630200 }, { "epoch": 36.86611686260748, "grad_norm": 0.18038693070411682, "learning_rate": 0.001, "loss": 1.8521, "step": 630300 }, { "epoch": 36.87196584196058, "grad_norm": 0.16898930072784424, "learning_rate": 0.001, "loss": 1.8447, "step": 630400 }, { "epoch": 36.87781482131368, "grad_norm": 0.21192620694637299, "learning_rate": 0.001, "loss": 1.8478, "step": 630500 }, { "epoch": 36.88366380066678, "grad_norm": 0.2091151475906372, "learning_rate": 0.001, "loss": 1.8503, "step": 630600 }, { "epoch": 36.889512780019885, "grad_norm": 0.24645225703716278, "learning_rate": 0.001, "loss": 1.8511, "step": 630700 }, { "epoch": 36.89536175937299, "grad_norm": 0.1886325180530548, "learning_rate": 0.001, "loss": 1.8467, "step": 630800 }, { "epoch": 36.90121073872609, "grad_norm": 0.1997394859790802, "learning_rate": 0.001, "loss": 1.8503, "step": 630900 }, { "epoch": 36.907059718079196, "grad_norm": 0.21052312850952148, "learning_rate": 0.001, "loss": 1.8453, "step": 631000 }, { "epoch": 36.9129086974323, "grad_norm": 0.22150112688541412, "learning_rate": 0.001, "loss": 1.8493, "step": 631100 }, { "epoch": 36.918757676785404, "grad_norm": 0.22056353092193604, "learning_rate": 0.001, "loss": 1.855, "step": 631200 }, { "epoch": 36.9246066561385, "grad_norm": 0.19683144986629486, "learning_rate": 0.001, "loss": 1.8519, "step": 631300 }, { "epoch": 36.930455635491604, "grad_norm": 0.19889794290065765, "learning_rate": 0.001, "loss": 1.8468, "step": 631400 }, { "epoch": 36.93630461484471, "grad_norm": 0.15301547944545746, "learning_rate": 0.001, "loss": 1.8487, "step": 631500 }, { "epoch": 36.94215359419781, "grad_norm": 0.23520636558532715, "learning_rate": 0.001, "loss": 1.8502, "step": 631600 }, { "epoch": 36.948002573550916, "grad_norm": 0.1431807279586792, "learning_rate": 0.001, "loss": 1.8487, "step": 631700 }, { "epoch": 36.95385155290402, "grad_norm": 0.1894932985305786, "learning_rate": 0.001, "loss": 1.8471, "step": 631800 }, { "epoch": 36.95970053225712, "grad_norm": 0.2160719931125641, "learning_rate": 0.001, "loss": 1.8509, "step": 631900 }, { "epoch": 36.96554951161023, "grad_norm": 0.3522154688835144, "learning_rate": 0.001, "loss": 1.8459, "step": 632000 }, { "epoch": 36.971398490963324, "grad_norm": 0.24693959951400757, "learning_rate": 0.001, "loss": 1.8514, "step": 632100 }, { "epoch": 36.97724747031643, "grad_norm": 0.17627832293510437, "learning_rate": 0.001, "loss": 1.8523, "step": 632200 }, { "epoch": 36.98309644966953, "grad_norm": 0.16821442544460297, "learning_rate": 0.001, "loss": 1.8528, "step": 632300 }, { "epoch": 36.988945429022635, "grad_norm": 0.17184047400951385, "learning_rate": 0.001, "loss": 1.8496, "step": 632400 }, { "epoch": 36.99479440837574, "grad_norm": 0.23408189415931702, "learning_rate": 0.001, "loss": 1.8495, "step": 632500 }, { "epoch": 37.00064338772884, "grad_norm": 0.20705747604370117, "learning_rate": 0.001, "loss": 1.8486, "step": 632600 }, { "epoch": 37.00649236708195, "grad_norm": 0.23840096592903137, "learning_rate": 0.001, "loss": 1.8336, "step": 632700 }, { "epoch": 37.01234134643505, "grad_norm": 0.2293562889099121, "learning_rate": 0.001, "loss": 1.8376, "step": 632800 }, { "epoch": 37.01819032578815, "grad_norm": 0.18624138832092285, "learning_rate": 0.001, "loss": 1.8381, "step": 632900 }, { "epoch": 37.02403930514125, "grad_norm": 0.21306701004505157, "learning_rate": 0.001, "loss": 1.8334, "step": 633000 }, { "epoch": 37.029888284494355, "grad_norm": 0.20190899074077606, "learning_rate": 0.001, "loss": 1.8357, "step": 633100 }, { "epoch": 37.03573726384746, "grad_norm": 0.2675846815109253, "learning_rate": 0.001, "loss": 1.8367, "step": 633200 }, { "epoch": 37.04158624320056, "grad_norm": 0.2019842118024826, "learning_rate": 0.001, "loss": 1.8334, "step": 633300 }, { "epoch": 37.047435222553666, "grad_norm": 0.26233741641044617, "learning_rate": 0.001, "loss": 1.8364, "step": 633400 }, { "epoch": 37.05328420190677, "grad_norm": 0.1929246336221695, "learning_rate": 0.001, "loss": 1.8381, "step": 633500 }, { "epoch": 37.05913318125987, "grad_norm": 0.20851865410804749, "learning_rate": 0.001, "loss": 1.8366, "step": 633600 }, { "epoch": 37.06498216061297, "grad_norm": 0.22136984765529633, "learning_rate": 0.001, "loss": 1.8323, "step": 633700 }, { "epoch": 37.070831139966074, "grad_norm": 0.2343042492866516, "learning_rate": 0.001, "loss": 1.8372, "step": 633800 }, { "epoch": 37.07668011931918, "grad_norm": 0.21828505396842957, "learning_rate": 0.001, "loss": 1.8368, "step": 633900 }, { "epoch": 37.08252909867228, "grad_norm": 0.20386189222335815, "learning_rate": 0.001, "loss": 1.8407, "step": 634000 }, { "epoch": 37.088378078025386, "grad_norm": 0.22413362562656403, "learning_rate": 0.001, "loss": 1.83, "step": 634100 }, { "epoch": 37.09422705737849, "grad_norm": 0.3648683428764343, "learning_rate": 0.001, "loss": 1.839, "step": 634200 }, { "epoch": 37.10007603673159, "grad_norm": 0.29289352893829346, "learning_rate": 0.001, "loss": 1.8337, "step": 634300 }, { "epoch": 37.10592501608469, "grad_norm": 0.17257773876190186, "learning_rate": 0.001, "loss": 1.8363, "step": 634400 }, { "epoch": 37.111773995437794, "grad_norm": 0.1988716423511505, "learning_rate": 0.001, "loss": 1.8362, "step": 634500 }, { "epoch": 37.1176229747909, "grad_norm": 0.20630612969398499, "learning_rate": 0.001, "loss": 1.8345, "step": 634600 }, { "epoch": 37.123471954144, "grad_norm": 0.17031772434711456, "learning_rate": 0.001, "loss": 1.8373, "step": 634700 }, { "epoch": 37.129320933497105, "grad_norm": 0.2258094996213913, "learning_rate": 0.001, "loss": 1.8372, "step": 634800 }, { "epoch": 37.13516991285021, "grad_norm": 0.22922681272029877, "learning_rate": 0.001, "loss": 1.8351, "step": 634900 }, { "epoch": 37.14101889220331, "grad_norm": 0.23424725234508514, "learning_rate": 0.001, "loss": 1.8406, "step": 635000 }, { "epoch": 37.14686787155642, "grad_norm": 0.20615576207637787, "learning_rate": 0.001, "loss": 1.8395, "step": 635100 }, { "epoch": 37.15271685090951, "grad_norm": 0.21216697990894318, "learning_rate": 0.001, "loss": 1.8396, "step": 635200 }, { "epoch": 37.15856583026262, "grad_norm": 0.2235933244228363, "learning_rate": 0.001, "loss": 1.8336, "step": 635300 }, { "epoch": 37.16441480961572, "grad_norm": 0.23039934039115906, "learning_rate": 0.001, "loss": 1.8373, "step": 635400 }, { "epoch": 37.170263788968825, "grad_norm": 0.2456943243741989, "learning_rate": 0.001, "loss": 1.8386, "step": 635500 }, { "epoch": 37.17611276832193, "grad_norm": 0.21950982511043549, "learning_rate": 0.001, "loss": 1.8404, "step": 635600 }, { "epoch": 37.18196174767503, "grad_norm": 0.19221068918704987, "learning_rate": 0.001, "loss": 1.8354, "step": 635700 }, { "epoch": 37.187810727028136, "grad_norm": 0.3007580041885376, "learning_rate": 0.001, "loss": 1.8425, "step": 635800 }, { "epoch": 37.19365970638124, "grad_norm": 0.17551778256893158, "learning_rate": 0.001, "loss": 1.842, "step": 635900 }, { "epoch": 37.19950868573434, "grad_norm": 0.22524593770503998, "learning_rate": 0.001, "loss": 1.8462, "step": 636000 }, { "epoch": 37.20535766508744, "grad_norm": 0.19183915853500366, "learning_rate": 0.001, "loss": 1.8417, "step": 636100 }, { "epoch": 37.211206644440544, "grad_norm": 0.25869277119636536, "learning_rate": 0.001, "loss": 1.8345, "step": 636200 }, { "epoch": 37.21705562379365, "grad_norm": 0.18566305935382843, "learning_rate": 0.001, "loss": 1.8399, "step": 636300 }, { "epoch": 37.22290460314675, "grad_norm": 0.198575958609581, "learning_rate": 0.001, "loss": 1.8376, "step": 636400 }, { "epoch": 37.228753582499856, "grad_norm": 0.18800272047519684, "learning_rate": 0.001, "loss": 1.84, "step": 636500 }, { "epoch": 37.23460256185296, "grad_norm": 0.19824561476707458, "learning_rate": 0.001, "loss": 1.8393, "step": 636600 }, { "epoch": 37.240451541206056, "grad_norm": 0.23149332404136658, "learning_rate": 0.001, "loss": 1.8385, "step": 636700 }, { "epoch": 37.24630052055916, "grad_norm": 0.18983334302902222, "learning_rate": 0.001, "loss": 1.8423, "step": 636800 }, { "epoch": 37.252149499912264, "grad_norm": 0.2284231334924698, "learning_rate": 0.001, "loss": 1.84, "step": 636900 }, { "epoch": 37.25799847926537, "grad_norm": 0.1912277489900589, "learning_rate": 0.001, "loss": 1.8404, "step": 637000 }, { "epoch": 37.26384745861847, "grad_norm": 0.17713157832622528, "learning_rate": 0.001, "loss": 1.8399, "step": 637100 }, { "epoch": 37.269696437971575, "grad_norm": 0.2139890193939209, "learning_rate": 0.001, "loss": 1.8378, "step": 637200 }, { "epoch": 37.27554541732468, "grad_norm": 0.2314668595790863, "learning_rate": 0.001, "loss": 1.8357, "step": 637300 }, { "epoch": 37.28139439667778, "grad_norm": 0.2247253805398941, "learning_rate": 0.001, "loss": 1.8396, "step": 637400 }, { "epoch": 37.28724337603088, "grad_norm": 0.2059551179409027, "learning_rate": 0.001, "loss": 1.8405, "step": 637500 }, { "epoch": 37.29309235538398, "grad_norm": 0.23305796086788177, "learning_rate": 0.001, "loss": 1.8414, "step": 637600 }, { "epoch": 37.29894133473709, "grad_norm": 0.2110508531332016, "learning_rate": 0.001, "loss": 1.8368, "step": 637700 }, { "epoch": 37.30479031409019, "grad_norm": 0.2642911374568939, "learning_rate": 0.001, "loss": 1.8456, "step": 637800 }, { "epoch": 37.310639293443295, "grad_norm": 0.21702654659748077, "learning_rate": 0.001, "loss": 1.8408, "step": 637900 }, { "epoch": 37.3164882727964, "grad_norm": 0.21587549149990082, "learning_rate": 0.001, "loss": 1.8471, "step": 638000 }, { "epoch": 37.3223372521495, "grad_norm": 0.19994819164276123, "learning_rate": 0.001, "loss": 1.8421, "step": 638100 }, { "epoch": 37.328186231502606, "grad_norm": 0.20551423728466034, "learning_rate": 0.001, "loss": 1.8422, "step": 638200 }, { "epoch": 37.3340352108557, "grad_norm": 0.24357828497886658, "learning_rate": 0.001, "loss": 1.8412, "step": 638300 }, { "epoch": 37.33988419020881, "grad_norm": 0.2230047732591629, "learning_rate": 0.001, "loss": 1.8417, "step": 638400 }, { "epoch": 37.34573316956191, "grad_norm": 0.25165459513664246, "learning_rate": 0.001, "loss": 1.8446, "step": 638500 }, { "epoch": 37.351582148915014, "grad_norm": 0.2113816887140274, "learning_rate": 0.001, "loss": 1.845, "step": 638600 }, { "epoch": 37.35743112826812, "grad_norm": 0.2605915367603302, "learning_rate": 0.001, "loss": 1.8401, "step": 638700 }, { "epoch": 37.36328010762122, "grad_norm": 0.19100628793239594, "learning_rate": 0.001, "loss": 1.8412, "step": 638800 }, { "epoch": 37.369129086974326, "grad_norm": 0.21365544199943542, "learning_rate": 0.001, "loss": 1.8453, "step": 638900 }, { "epoch": 37.37497806632743, "grad_norm": 0.23477454483509064, "learning_rate": 0.001, "loss": 1.8422, "step": 639000 }, { "epoch": 37.380827045680526, "grad_norm": 0.1670394390821457, "learning_rate": 0.001, "loss": 1.8428, "step": 639100 }, { "epoch": 37.38667602503363, "grad_norm": 0.19631905853748322, "learning_rate": 0.001, "loss": 1.8475, "step": 639200 }, { "epoch": 37.392525004386734, "grad_norm": 0.25034329295158386, "learning_rate": 0.001, "loss": 1.8432, "step": 639300 }, { "epoch": 37.39837398373984, "grad_norm": 0.17355339229106903, "learning_rate": 0.001, "loss": 1.8416, "step": 639400 }, { "epoch": 37.40422296309294, "grad_norm": 0.21937303245067596, "learning_rate": 0.001, "loss": 1.8448, "step": 639500 }, { "epoch": 37.410071942446045, "grad_norm": 0.23474842309951782, "learning_rate": 0.001, "loss": 1.8462, "step": 639600 }, { "epoch": 37.41592092179915, "grad_norm": 0.19959691166877747, "learning_rate": 0.001, "loss": 1.8438, "step": 639700 }, { "epoch": 37.421769901152246, "grad_norm": 0.27831628918647766, "learning_rate": 0.001, "loss": 1.8422, "step": 639800 }, { "epoch": 37.42761888050535, "grad_norm": 0.2775475084781647, "learning_rate": 0.001, "loss": 1.8426, "step": 639900 }, { "epoch": 37.43346785985845, "grad_norm": 0.24112673103809357, "learning_rate": 0.001, "loss": 1.8441, "step": 640000 }, { "epoch": 37.43931683921156, "grad_norm": 0.21791619062423706, "learning_rate": 0.001, "loss": 1.8377, "step": 640100 }, { "epoch": 37.44516581856466, "grad_norm": 0.2257019281387329, "learning_rate": 0.001, "loss": 1.8433, "step": 640200 }, { "epoch": 37.451014797917765, "grad_norm": 0.18768809735774994, "learning_rate": 0.001, "loss": 1.845, "step": 640300 }, { "epoch": 37.45686377727087, "grad_norm": 0.2460939735174179, "learning_rate": 0.001, "loss": 1.8427, "step": 640400 }, { "epoch": 37.46271275662397, "grad_norm": 0.17159759998321533, "learning_rate": 0.001, "loss": 1.8448, "step": 640500 }, { "epoch": 37.46856173597707, "grad_norm": 0.1792740374803543, "learning_rate": 0.001, "loss": 1.8481, "step": 640600 }, { "epoch": 37.47441071533017, "grad_norm": 0.21354375779628754, "learning_rate": 0.001, "loss": 1.8432, "step": 640700 }, { "epoch": 37.48025969468328, "grad_norm": 0.20588774979114532, "learning_rate": 0.001, "loss": 1.8395, "step": 640800 }, { "epoch": 37.48610867403638, "grad_norm": 0.2425035536289215, "learning_rate": 0.001, "loss": 1.8401, "step": 640900 }, { "epoch": 37.491957653389484, "grad_norm": 0.18409140408039093, "learning_rate": 0.001, "loss": 1.8467, "step": 641000 }, { "epoch": 37.49780663274259, "grad_norm": 0.20562149584293365, "learning_rate": 0.001, "loss": 1.8373, "step": 641100 }, { "epoch": 37.50365561209569, "grad_norm": 0.2101922184228897, "learning_rate": 0.001, "loss": 1.8451, "step": 641200 }, { "epoch": 37.509504591448795, "grad_norm": 0.33141592144966125, "learning_rate": 0.001, "loss": 1.8476, "step": 641300 }, { "epoch": 37.51535357080189, "grad_norm": 0.21630273759365082, "learning_rate": 0.001, "loss": 1.8389, "step": 641400 }, { "epoch": 37.521202550154996, "grad_norm": 0.21298958361148834, "learning_rate": 0.001, "loss": 1.8461, "step": 641500 }, { "epoch": 37.5270515295081, "grad_norm": 0.2328283041715622, "learning_rate": 0.001, "loss": 1.848, "step": 641600 }, { "epoch": 37.5329005088612, "grad_norm": 0.20401349663734436, "learning_rate": 0.001, "loss": 1.8427, "step": 641700 }, { "epoch": 37.53874948821431, "grad_norm": 0.1883656233549118, "learning_rate": 0.001, "loss": 1.847, "step": 641800 }, { "epoch": 37.54459846756741, "grad_norm": 0.22812439501285553, "learning_rate": 0.001, "loss": 1.8416, "step": 641900 }, { "epoch": 37.550447446920515, "grad_norm": 0.20589034259319305, "learning_rate": 0.001, "loss": 1.838, "step": 642000 }, { "epoch": 37.55629642627362, "grad_norm": 0.2984786033630371, "learning_rate": 0.001, "loss": 1.8458, "step": 642100 }, { "epoch": 37.562145405626715, "grad_norm": 0.20201000571250916, "learning_rate": 0.001, "loss": 1.8412, "step": 642200 }, { "epoch": 37.56799438497982, "grad_norm": 0.24541044235229492, "learning_rate": 0.001, "loss": 1.8459, "step": 642300 }, { "epoch": 37.57384336433292, "grad_norm": 0.25108322501182556, "learning_rate": 0.001, "loss": 1.8458, "step": 642400 }, { "epoch": 37.57969234368603, "grad_norm": 0.18537987768650055, "learning_rate": 0.001, "loss": 1.8404, "step": 642500 }, { "epoch": 37.58554132303913, "grad_norm": 0.19340495765209198, "learning_rate": 0.001, "loss": 1.8428, "step": 642600 }, { "epoch": 37.591390302392234, "grad_norm": 0.20137587189674377, "learning_rate": 0.001, "loss": 1.8422, "step": 642700 }, { "epoch": 37.59723928174534, "grad_norm": 0.24427206814289093, "learning_rate": 0.001, "loss": 1.8424, "step": 642800 }, { "epoch": 37.603088261098435, "grad_norm": 0.2232440561056137, "learning_rate": 0.001, "loss": 1.8451, "step": 642900 }, { "epoch": 37.60893724045154, "grad_norm": 0.19838635623455048, "learning_rate": 0.001, "loss": 1.8512, "step": 643000 }, { "epoch": 37.61478621980464, "grad_norm": 0.20751744508743286, "learning_rate": 0.001, "loss": 1.8397, "step": 643100 }, { "epoch": 37.620635199157746, "grad_norm": 0.19123943150043488, "learning_rate": 0.001, "loss": 1.8377, "step": 643200 }, { "epoch": 37.62648417851085, "grad_norm": 0.2787471413612366, "learning_rate": 0.001, "loss": 1.8443, "step": 643300 }, { "epoch": 37.632333157863954, "grad_norm": 0.1968458592891693, "learning_rate": 0.001, "loss": 1.8468, "step": 643400 }, { "epoch": 37.63818213721706, "grad_norm": 0.24415534734725952, "learning_rate": 0.001, "loss": 1.8468, "step": 643500 }, { "epoch": 37.64403111657016, "grad_norm": 0.21697241067886353, "learning_rate": 0.001, "loss": 1.8394, "step": 643600 }, { "epoch": 37.64988009592326, "grad_norm": 0.24984246492385864, "learning_rate": 0.001, "loss": 1.8411, "step": 643700 }, { "epoch": 37.65572907527636, "grad_norm": 0.2653696537017822, "learning_rate": 0.001, "loss": 1.8414, "step": 643800 }, { "epoch": 37.661578054629466, "grad_norm": 0.1875111311674118, "learning_rate": 0.001, "loss": 1.8471, "step": 643900 }, { "epoch": 37.66742703398257, "grad_norm": 0.16487501561641693, "learning_rate": 0.001, "loss": 1.8403, "step": 644000 }, { "epoch": 37.67327601333567, "grad_norm": 0.1790855973958969, "learning_rate": 0.001, "loss": 1.8458, "step": 644100 }, { "epoch": 37.67912499268878, "grad_norm": 0.22987647354602814, "learning_rate": 0.001, "loss": 1.846, "step": 644200 }, { "epoch": 37.68497397204188, "grad_norm": 0.20893296599388123, "learning_rate": 0.001, "loss": 1.8487, "step": 644300 }, { "epoch": 37.690822951394985, "grad_norm": 0.24345028400421143, "learning_rate": 0.001, "loss": 1.843, "step": 644400 }, { "epoch": 37.69667193074808, "grad_norm": 0.29733771085739136, "learning_rate": 0.001, "loss": 1.8532, "step": 644500 }, { "epoch": 37.702520910101185, "grad_norm": 0.18534554541110992, "learning_rate": 0.001, "loss": 1.8441, "step": 644600 }, { "epoch": 37.70836988945429, "grad_norm": 0.3095342814922333, "learning_rate": 0.001, "loss": 1.8438, "step": 644700 }, { "epoch": 37.71421886880739, "grad_norm": 0.21777890622615814, "learning_rate": 0.001, "loss": 1.8467, "step": 644800 }, { "epoch": 37.7200678481605, "grad_norm": 0.23610977828502655, "learning_rate": 0.001, "loss": 1.8402, "step": 644900 }, { "epoch": 37.7259168275136, "grad_norm": 0.22179485857486725, "learning_rate": 0.001, "loss": 1.8425, "step": 645000 }, { "epoch": 37.731765806866704, "grad_norm": 0.18963855504989624, "learning_rate": 0.001, "loss": 1.8446, "step": 645100 }, { "epoch": 37.73761478621981, "grad_norm": 0.2255353480577469, "learning_rate": 0.001, "loss": 1.842, "step": 645200 }, { "epoch": 37.743463765572905, "grad_norm": 0.27601903676986694, "learning_rate": 0.001, "loss": 1.8488, "step": 645300 }, { "epoch": 37.74931274492601, "grad_norm": 0.23124228417873383, "learning_rate": 0.001, "loss": 1.8509, "step": 645400 }, { "epoch": 37.75516172427911, "grad_norm": 0.2038297951221466, "learning_rate": 0.001, "loss": 1.8446, "step": 645500 }, { "epoch": 37.761010703632216, "grad_norm": 0.27538010478019714, "learning_rate": 0.001, "loss": 1.8423, "step": 645600 }, { "epoch": 37.76685968298532, "grad_norm": 0.29155880212783813, "learning_rate": 0.001, "loss": 1.8506, "step": 645700 }, { "epoch": 37.772708662338424, "grad_norm": 0.24788300693035126, "learning_rate": 0.001, "loss": 1.8479, "step": 645800 }, { "epoch": 37.77855764169153, "grad_norm": 0.23231224715709686, "learning_rate": 0.001, "loss": 1.8448, "step": 645900 }, { "epoch": 37.784406621044624, "grad_norm": 0.19679366052150726, "learning_rate": 0.001, "loss": 1.8447, "step": 646000 }, { "epoch": 37.79025560039773, "grad_norm": 0.19301532208919525, "learning_rate": 0.001, "loss": 1.8428, "step": 646100 }, { "epoch": 37.79610457975083, "grad_norm": 0.21716441214084625, "learning_rate": 0.001, "loss": 1.8488, "step": 646200 }, { "epoch": 37.801953559103936, "grad_norm": 0.21621347963809967, "learning_rate": 0.001, "loss": 1.8484, "step": 646300 }, { "epoch": 37.80780253845704, "grad_norm": 0.2368260622024536, "learning_rate": 0.001, "loss": 1.8526, "step": 646400 }, { "epoch": 37.81365151781014, "grad_norm": 0.22770600020885468, "learning_rate": 0.001, "loss": 1.8455, "step": 646500 }, { "epoch": 37.81950049716325, "grad_norm": 0.2397926151752472, "learning_rate": 0.001, "loss": 1.8453, "step": 646600 }, { "epoch": 37.82534947651635, "grad_norm": 0.1955961287021637, "learning_rate": 0.001, "loss": 1.8453, "step": 646700 }, { "epoch": 37.83119845586945, "grad_norm": 0.22295121848583221, "learning_rate": 0.001, "loss": 1.8425, "step": 646800 }, { "epoch": 37.83704743522255, "grad_norm": 0.28126460313796997, "learning_rate": 0.001, "loss": 1.8433, "step": 646900 }, { "epoch": 37.842896414575655, "grad_norm": 0.21830415725708008, "learning_rate": 0.001, "loss": 1.8501, "step": 647000 }, { "epoch": 37.84874539392876, "grad_norm": 0.23315127193927765, "learning_rate": 0.001, "loss": 1.847, "step": 647100 }, { "epoch": 37.85459437328186, "grad_norm": 0.18374694883823395, "learning_rate": 0.001, "loss": 1.8415, "step": 647200 }, { "epoch": 37.86044335263497, "grad_norm": 0.17268627882003784, "learning_rate": 0.001, "loss": 1.8434, "step": 647300 }, { "epoch": 37.86629233198807, "grad_norm": 0.20803464949131012, "learning_rate": 0.001, "loss": 1.8453, "step": 647400 }, { "epoch": 37.872141311341174, "grad_norm": 0.20693086087703705, "learning_rate": 0.001, "loss": 1.8458, "step": 647500 }, { "epoch": 37.87799029069427, "grad_norm": 0.21429604291915894, "learning_rate": 0.001, "loss": 1.8461, "step": 647600 }, { "epoch": 37.883839270047375, "grad_norm": 0.2526037395000458, "learning_rate": 0.001, "loss": 1.8444, "step": 647700 }, { "epoch": 37.88968824940048, "grad_norm": 0.18953204154968262, "learning_rate": 0.001, "loss": 1.8407, "step": 647800 }, { "epoch": 37.89553722875358, "grad_norm": 0.22938525676727295, "learning_rate": 0.001, "loss": 1.8454, "step": 647900 }, { "epoch": 37.901386208106686, "grad_norm": 0.19667810201644897, "learning_rate": 0.001, "loss": 1.8489, "step": 648000 }, { "epoch": 37.90723518745979, "grad_norm": 0.24558916687965393, "learning_rate": 0.001, "loss": 1.8427, "step": 648100 }, { "epoch": 37.913084166812894, "grad_norm": 0.21874067187309265, "learning_rate": 0.001, "loss": 1.846, "step": 648200 }, { "epoch": 37.918933146166, "grad_norm": 0.19958099722862244, "learning_rate": 0.001, "loss": 1.844, "step": 648300 }, { "epoch": 37.924782125519094, "grad_norm": 0.28501278162002563, "learning_rate": 0.001, "loss": 1.8495, "step": 648400 }, { "epoch": 37.9306311048722, "grad_norm": 0.21076396107673645, "learning_rate": 0.001, "loss": 1.8493, "step": 648500 }, { "epoch": 37.9364800842253, "grad_norm": 0.21368733048439026, "learning_rate": 0.001, "loss": 1.8441, "step": 648600 }, { "epoch": 37.942329063578406, "grad_norm": 0.21276850998401642, "learning_rate": 0.001, "loss": 1.8418, "step": 648700 }, { "epoch": 37.94817804293151, "grad_norm": 0.22714678943157196, "learning_rate": 0.001, "loss": 1.8462, "step": 648800 }, { "epoch": 37.95402702228461, "grad_norm": 0.16968989372253418, "learning_rate": 0.001, "loss": 1.8511, "step": 648900 }, { "epoch": 37.95987600163772, "grad_norm": 0.2230595350265503, "learning_rate": 0.001, "loss": 1.8441, "step": 649000 }, { "epoch": 37.965724980990814, "grad_norm": 0.23879601061344147, "learning_rate": 0.001, "loss": 1.8455, "step": 649100 }, { "epoch": 37.97157396034392, "grad_norm": 0.24322447180747986, "learning_rate": 0.001, "loss": 1.8476, "step": 649200 }, { "epoch": 37.97742293969702, "grad_norm": 0.17641042172908783, "learning_rate": 0.001, "loss": 1.8413, "step": 649300 }, { "epoch": 37.983271919050125, "grad_norm": 0.23862943053245544, "learning_rate": 0.001, "loss": 1.8509, "step": 649400 }, { "epoch": 37.98912089840323, "grad_norm": 0.1868886500597, "learning_rate": 0.001, "loss": 1.847, "step": 649500 }, { "epoch": 37.99496987775633, "grad_norm": 0.22073547542095184, "learning_rate": 0.001, "loss": 1.8378, "step": 649600 }, { "epoch": 38.00081885710944, "grad_norm": 0.18446919322013855, "learning_rate": 0.001, "loss": 1.8422, "step": 649700 }, { "epoch": 38.00666783646254, "grad_norm": 0.2154252827167511, "learning_rate": 0.001, "loss": 1.823, "step": 649800 }, { "epoch": 38.01251681581564, "grad_norm": 0.15971973538398743, "learning_rate": 0.001, "loss": 1.8296, "step": 649900 }, { "epoch": 38.01836579516874, "grad_norm": 0.20659615099430084, "learning_rate": 0.001, "loss": 1.8364, "step": 650000 }, { "epoch": 38.01836579516874, "eval_ag_news_accuracy": 0.239578125, "eval_ag_news_bleu_score": 8.190626072294032, "eval_ag_news_bleu_score_sem": 0.7486187011032933, "eval_ag_news_emb_cos_sim": 0.7291132211685181, "eval_ag_news_emb_cos_sim_sem": 0.0156043516471982, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.715895414352417, "eval_ag_news_n_ngrams_match_1": 13.84375, "eval_ag_news_n_ngrams_match_2": 4.3359375, "eval_ag_news_n_ngrams_match_3": 1.8125, "eval_ag_news_num_pred_words": 44.90625, "eval_ag_news_num_true_words": 43.859375, "eval_ag_news_perplexity": 15.118141013537057, "eval_ag_news_pred_num_tokens": 67.6328125, "eval_ag_news_rouge_score": 0.31386253430947914, "eval_ag_news_runtime": 36.2312, "eval_ag_news_samples_per_second": 13.8, "eval_ag_news_steps_per_second": 0.028, "eval_ag_news_token_set_f1": 0.34695067660394474, "eval_ag_news_token_set_f1_sem": 0.011535291181878616, "eval_ag_news_token_set_precision": 0.32721159288543844, "eval_ag_news_token_set_recall": 0.3815372248308525, "eval_ag_news_true_num_tokens": 61.3671875, "step": 650000 }, { "epoch": 38.01836579516874, "eval_anthropic_toxic_prompts_accuracy": 0.102640625, "eval_anthropic_toxic_prompts_bleu_score": 42.32560793202581, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.609723718242431, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8799391984939575, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011129030957818031, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1328125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030114393430435732, "eval_anthropic_toxic_prompts_loss": 1.2517701387405396, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.6171875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.8203125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.8828125, "eval_anthropic_toxic_prompts_num_pred_words": 15.9453125, "eval_anthropic_toxic_prompts_num_true_words": 15.3515625, "eval_anthropic_toxic_prompts_perplexity": 3.4965268202814808, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.859375, "eval_anthropic_toxic_prompts_rouge_score": 0.6845605398537554, "eval_anthropic_toxic_prompts_runtime": 29.1434, "eval_anthropic_toxic_prompts_samples_per_second": 17.157, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.7045997401134696, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018298124499537684, "eval_anthropic_toxic_prompts_token_set_precision": 0.7051660181578802, "eval_anthropic_toxic_prompts_token_set_recall": 0.7082696333845352, "eval_anthropic_toxic_prompts_true_num_tokens": 19.2109375, "step": 650000 }, { "epoch": 38.01836579516874, "eval_arxiv_accuracy": 0.376390625, "eval_arxiv_bleu_score": 1.838070387655347, "eval_arxiv_bleu_score_sem": 0.15426121004319268, "eval_arxiv_emb_cos_sim": 0.48120731115341187, "eval_arxiv_emb_cos_sim_sem": 0.0193129051476717, "eval_arxiv_emb_top1_equal": 0.953125, "eval_arxiv_emb_top1_equal_sem": 0.01875615119934082, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4291043281555176, "eval_arxiv_n_ngrams_match_1": 14.1328125, "eval_arxiv_n_ngrams_match_2": 2.484375, "eval_arxiv_n_ngrams_match_3": 0.484375, "eval_arxiv_num_pred_words": 56.0703125, "eval_arxiv_num_true_words": 85.984375, "eval_arxiv_perplexity": 30.84899979147169, "eval_arxiv_pred_num_tokens": 125.6015625, "eval_arxiv_rouge_score": 0.1844037939715169, "eval_arxiv_runtime": 30.3331, "eval_arxiv_samples_per_second": 16.484, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.18265603120141097, "eval_arxiv_token_set_f1_sem": 0.008338080847050523, "eval_arxiv_token_set_precision": 0.12589014594258535, "eval_arxiv_token_set_recall": 0.40394795143012047, "eval_arxiv_true_num_tokens": 125.6171875, "step": 650000 }, { "epoch": 38.01836579516874, "eval_python_code_alpaca_accuracy": 0.128328125, "eval_python_code_alpaca_bleu_score": 30.58092498824295, "eval_python_code_alpaca_bleu_score_sem": 1.7597427369445093, "eval_python_code_alpaca_emb_cos_sim": 0.8737627863883972, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009457619860768318, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4599500894546509, "eval_python_code_alpaca_n_ngrams_match_1": 10.03125, "eval_python_code_alpaca_n_ngrams_match_2": 5.75, "eval_python_code_alpaca_n_ngrams_match_3": 3.4375, "eval_python_code_alpaca_num_pred_words": 16.9453125, "eval_python_code_alpaca_num_true_words": 18.0234375, "eval_python_code_alpaca_perplexity": 4.305744620920013, "eval_python_code_alpaca_pred_num_tokens": 22.9453125, "eval_python_code_alpaca_rouge_score": 0.5982433968975723, "eval_python_code_alpaca_runtime": 29.8526, "eval_python_code_alpaca_samples_per_second": 16.749, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6145766962063292, "eval_python_code_alpaca_token_set_f1_sem": 0.014081408813892544, "eval_python_code_alpaca_token_set_precision": 0.6017928948083272, "eval_python_code_alpaca_token_set_recall": 0.6344275292550372, "eval_python_code_alpaca_true_num_tokens": 23.703125, "step": 650000 }, { "epoch": 38.01836579516874, "eval_wikibio_accuracy": 0.3664375, "eval_wikibio_bleu_score": 7.225308126513714, "eval_wikibio_bleu_score_sem": 0.6952527447358428, "eval_wikibio_emb_cos_sim": 0.5915143489837646, "eval_wikibio_emb_cos_sim_sem": 0.022699031978845596, "eval_wikibio_emb_top1_equal": 0.921875, "eval_wikibio_emb_top1_equal_sem": 0.023813825100660324, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7136971950531006, "eval_wikibio_n_ngrams_match_1": 15.3828125, "eval_wikibio_n_ngrams_match_2": 5.328125, "eval_wikibio_n_ngrams_match_3": 2.203125, "eval_wikibio_num_pred_words": 52.875, "eval_wikibio_num_true_words": 54.0625, "eval_wikibio_perplexity": 15.084944524140727, "eval_wikibio_pred_num_tokens": 108.875, "eval_wikibio_rouge_score": 0.2920358351794189, "eval_wikibio_runtime": 32.1808, "eval_wikibio_samples_per_second": 15.537, "eval_wikibio_steps_per_second": 0.031, "eval_wikibio_token_set_f1": 0.3205990295098915, "eval_wikibio_token_set_f1_sem": 0.01232324786090226, "eval_wikibio_token_set_precision": 0.283619465545432, "eval_wikibio_token_set_recall": 0.4151908205290844, "eval_wikibio_true_num_tokens": 103.4375, "step": 650000 }, { "epoch": 38.01836579516874, "eval_msmarco_accuracy": 0.39140625, "eval_msmarco_bleu_score": 18.393359363264576, "eval_msmarco_bleu_score_sem": 1.5024133050538353, "eval_msmarco_emb_cos_sim": 0.8083353042602539, "eval_msmarco_emb_cos_sim_sem": 0.014916528947651386, "eval_msmarco_emb_top1_equal": 0.9296875, "eval_msmarco_emb_top1_equal_sem": 0.022687306627631187, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7385706901550293, "eval_msmarco_n_ngrams_match_1": 29.3359375, "eval_msmarco_n_ngrams_match_2": 13.8359375, "eval_msmarco_n_ngrams_match_3": 8.3125, "eval_msmarco_num_pred_words": 61.3359375, "eval_msmarco_num_true_words": 60.796875, "eval_msmarco_perplexity": 5.6892059704834015, "eval_msmarco_pred_num_tokens": 81.1015625, "eval_msmarco_rouge_score": 0.46053234567061346, "eval_msmarco_runtime": 25.0754, "eval_msmarco_samples_per_second": 19.94, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.48720324316010954, "eval_msmarco_token_set_f1_sem": 0.013077906324111847, "eval_msmarco_token_set_precision": 0.45756043918073325, "eval_msmarco_token_set_recall": 0.5343351779910172, "eval_msmarco_true_num_tokens": 78.5546875, "step": 650000 }, { "epoch": 38.024214774521845, "grad_norm": 0.2701948881149292, "learning_rate": 0.001, "loss": 1.8329, "step": 650100 }, { "epoch": 38.03006375387495, "grad_norm": 0.15724526345729828, "learning_rate": 0.001, "loss": 1.832, "step": 650200 }, { "epoch": 38.03591273322805, "grad_norm": 0.1873961240053177, "learning_rate": 0.001, "loss": 1.8302, "step": 650300 }, { "epoch": 38.041761712581156, "grad_norm": 0.21293553709983826, "learning_rate": 0.001, "loss": 1.8319, "step": 650400 }, { "epoch": 38.04761069193426, "grad_norm": 0.2209113985300064, "learning_rate": 0.001, "loss": 1.8335, "step": 650500 }, { "epoch": 38.053459671287364, "grad_norm": 0.20507501065731049, "learning_rate": 0.001, "loss": 1.8265, "step": 650600 }, { "epoch": 38.05930865064046, "grad_norm": 0.16117821633815765, "learning_rate": 0.001, "loss": 1.8336, "step": 650700 }, { "epoch": 38.065157629993564, "grad_norm": 0.14465515315532684, "learning_rate": 0.001, "loss": 1.8302, "step": 650800 }, { "epoch": 38.07100660934667, "grad_norm": 0.2595275342464447, "learning_rate": 0.001, "loss": 1.8292, "step": 650900 }, { "epoch": 38.07685558869977, "grad_norm": 0.20661669969558716, "learning_rate": 0.001, "loss": 1.8369, "step": 651000 }, { "epoch": 38.082704568052876, "grad_norm": 0.16010364890098572, "learning_rate": 0.001, "loss": 1.8275, "step": 651100 }, { "epoch": 38.08855354740598, "grad_norm": 0.14054450392723083, "learning_rate": 0.001, "loss": 1.8319, "step": 651200 }, { "epoch": 38.09440252675908, "grad_norm": 0.16118161380290985, "learning_rate": 0.001, "loss": 1.8271, "step": 651300 }, { "epoch": 38.10025150611219, "grad_norm": 0.20302002131938934, "learning_rate": 0.001, "loss": 1.8365, "step": 651400 }, { "epoch": 38.106100485465284, "grad_norm": 0.22252830862998962, "learning_rate": 0.001, "loss": 1.8386, "step": 651500 }, { "epoch": 38.11194946481839, "grad_norm": 0.22736521065235138, "learning_rate": 0.001, "loss": 1.8318, "step": 651600 }, { "epoch": 38.11779844417149, "grad_norm": 0.17269077897071838, "learning_rate": 0.001, "loss": 1.8371, "step": 651700 }, { "epoch": 38.123647423524595, "grad_norm": 0.18495555222034454, "learning_rate": 0.001, "loss": 1.8329, "step": 651800 }, { "epoch": 38.1294964028777, "grad_norm": 0.15725235641002655, "learning_rate": 0.001, "loss": 1.8279, "step": 651900 }, { "epoch": 38.1353453822308, "grad_norm": 0.21801286935806274, "learning_rate": 0.001, "loss": 1.8354, "step": 652000 }, { "epoch": 38.141194361583906, "grad_norm": 0.2302940934896469, "learning_rate": 0.001, "loss": 1.8422, "step": 652100 }, { "epoch": 38.147043340937, "grad_norm": 0.20533806085586548, "learning_rate": 0.001, "loss": 1.8363, "step": 652200 }, { "epoch": 38.15289232029011, "grad_norm": 0.21140600740909576, "learning_rate": 0.001, "loss": 1.8323, "step": 652300 }, { "epoch": 38.15874129964321, "grad_norm": 0.35073214769363403, "learning_rate": 0.001, "loss": 1.8343, "step": 652400 }, { "epoch": 38.164590278996315, "grad_norm": 0.2147001326084137, "learning_rate": 0.001, "loss": 1.8352, "step": 652500 }, { "epoch": 38.17043925834942, "grad_norm": 0.2047489583492279, "learning_rate": 0.001, "loss": 1.8364, "step": 652600 }, { "epoch": 38.17628823770252, "grad_norm": 0.23942831158638, "learning_rate": 0.001, "loss": 1.8398, "step": 652700 }, { "epoch": 38.182137217055626, "grad_norm": 0.14057281613349915, "learning_rate": 0.001, "loss": 1.8363, "step": 652800 }, { "epoch": 38.18798619640873, "grad_norm": 0.16108523309230804, "learning_rate": 0.001, "loss": 1.8294, "step": 652900 }, { "epoch": 38.193835175761826, "grad_norm": 0.21032586693763733, "learning_rate": 0.001, "loss": 1.8397, "step": 653000 }, { "epoch": 38.19968415511493, "grad_norm": 0.2548286020755768, "learning_rate": 0.001, "loss": 1.8432, "step": 653100 }, { "epoch": 38.205533134468034, "grad_norm": 0.20131617784500122, "learning_rate": 0.001, "loss": 1.8405, "step": 653200 }, { "epoch": 38.21138211382114, "grad_norm": 0.19964277744293213, "learning_rate": 0.001, "loss": 1.837, "step": 653300 }, { "epoch": 38.21723109317424, "grad_norm": 0.24170322716236115, "learning_rate": 0.001, "loss": 1.8343, "step": 653400 }, { "epoch": 38.223080072527345, "grad_norm": 0.16834087669849396, "learning_rate": 0.001, "loss": 1.8348, "step": 653500 }, { "epoch": 38.22892905188045, "grad_norm": 0.18962763249874115, "learning_rate": 0.001, "loss": 1.841, "step": 653600 }, { "epoch": 38.23477803123355, "grad_norm": 0.17201034724712372, "learning_rate": 0.001, "loss": 1.8368, "step": 653700 }, { "epoch": 38.24062701058665, "grad_norm": 0.13810501992702484, "learning_rate": 0.001, "loss": 1.8348, "step": 653800 }, { "epoch": 38.24647598993975, "grad_norm": 0.18973658978939056, "learning_rate": 0.001, "loss": 1.8322, "step": 653900 }, { "epoch": 38.25232496929286, "grad_norm": 0.221978560090065, "learning_rate": 0.001, "loss": 1.8331, "step": 654000 }, { "epoch": 38.25817394864596, "grad_norm": 0.22257182002067566, "learning_rate": 0.001, "loss": 1.8385, "step": 654100 }, { "epoch": 38.264022927999065, "grad_norm": 0.29896384477615356, "learning_rate": 0.001, "loss": 1.8368, "step": 654200 }, { "epoch": 38.26987190735217, "grad_norm": 0.1524696946144104, "learning_rate": 0.001, "loss": 1.8376, "step": 654300 }, { "epoch": 38.27572088670527, "grad_norm": 0.22388724982738495, "learning_rate": 0.001, "loss": 1.8402, "step": 654400 }, { "epoch": 38.281569866058376, "grad_norm": 0.18629810214042664, "learning_rate": 0.001, "loss": 1.8372, "step": 654500 }, { "epoch": 38.28741884541147, "grad_norm": 0.18399156630039215, "learning_rate": 0.001, "loss": 1.8325, "step": 654600 }, { "epoch": 38.29326782476458, "grad_norm": 0.17891032993793488, "learning_rate": 0.001, "loss": 1.8366, "step": 654700 }, { "epoch": 38.29911680411768, "grad_norm": 0.17161771655082703, "learning_rate": 0.001, "loss": 1.8347, "step": 654800 }, { "epoch": 38.304965783470784, "grad_norm": 0.1487145721912384, "learning_rate": 0.001, "loss": 1.8369, "step": 654900 }, { "epoch": 38.31081476282389, "grad_norm": 0.16480545699596405, "learning_rate": 0.001, "loss": 1.8404, "step": 655000 }, { "epoch": 38.31666374217699, "grad_norm": 0.17440055310726166, "learning_rate": 0.001, "loss": 1.8371, "step": 655100 }, { "epoch": 38.322512721530096, "grad_norm": 0.1839863657951355, "learning_rate": 0.001, "loss": 1.8363, "step": 655200 }, { "epoch": 38.32836170088319, "grad_norm": 0.21775788068771362, "learning_rate": 0.001, "loss": 1.8373, "step": 655300 }, { "epoch": 38.334210680236296, "grad_norm": 0.14544561505317688, "learning_rate": 0.001, "loss": 1.8406, "step": 655400 }, { "epoch": 38.3400596595894, "grad_norm": 0.18529483675956726, "learning_rate": 0.001, "loss": 1.8338, "step": 655500 }, { "epoch": 38.345908638942504, "grad_norm": 0.22129687666893005, "learning_rate": 0.001, "loss": 1.8415, "step": 655600 }, { "epoch": 38.35175761829561, "grad_norm": 0.17230723798274994, "learning_rate": 0.001, "loss": 1.8378, "step": 655700 }, { "epoch": 38.35760659764871, "grad_norm": 0.30147284269332886, "learning_rate": 0.001, "loss": 1.8396, "step": 655800 }, { "epoch": 38.363455577001815, "grad_norm": 0.16661129891872406, "learning_rate": 0.001, "loss": 1.8367, "step": 655900 }, { "epoch": 38.36930455635492, "grad_norm": 0.1730402112007141, "learning_rate": 0.001, "loss": 1.8399, "step": 656000 }, { "epoch": 38.375153535708016, "grad_norm": 0.24892744421958923, "learning_rate": 0.001, "loss": 1.8357, "step": 656100 }, { "epoch": 38.38100251506112, "grad_norm": 0.1701040416955948, "learning_rate": 0.001, "loss": 1.8387, "step": 656200 }, { "epoch": 38.38685149441422, "grad_norm": 0.15982882678508759, "learning_rate": 0.001, "loss": 1.8433, "step": 656300 }, { "epoch": 38.39270047376733, "grad_norm": 0.12968438863754272, "learning_rate": 0.001, "loss": 1.8374, "step": 656400 }, { "epoch": 38.39854945312043, "grad_norm": 0.1833108812570572, "learning_rate": 0.001, "loss": 1.8389, "step": 656500 }, { "epoch": 38.404398432473535, "grad_norm": 0.2565152049064636, "learning_rate": 0.001, "loss": 1.8432, "step": 656600 }, { "epoch": 38.41024741182664, "grad_norm": 0.17316949367523193, "learning_rate": 0.001, "loss": 1.8391, "step": 656700 }, { "epoch": 38.41609639117974, "grad_norm": 0.21787206828594208, "learning_rate": 0.001, "loss": 1.8438, "step": 656800 }, { "epoch": 38.42194537053284, "grad_norm": 0.2731776237487793, "learning_rate": 0.001, "loss": 1.8381, "step": 656900 }, { "epoch": 38.42779434988594, "grad_norm": 0.1698216050863266, "learning_rate": 0.001, "loss": 1.8421, "step": 657000 }, { "epoch": 38.43364332923905, "grad_norm": 0.16636863350868225, "learning_rate": 0.001, "loss": 1.8375, "step": 657100 }, { "epoch": 38.43949230859215, "grad_norm": 0.17013296484947205, "learning_rate": 0.001, "loss": 1.8381, "step": 657200 }, { "epoch": 38.445341287945254, "grad_norm": 0.19416455924510956, "learning_rate": 0.001, "loss": 1.8409, "step": 657300 }, { "epoch": 38.45119026729836, "grad_norm": 0.17785325646400452, "learning_rate": 0.001, "loss": 1.8403, "step": 657400 }, { "epoch": 38.45703924665146, "grad_norm": 0.17675918340682983, "learning_rate": 0.001, "loss": 1.833, "step": 657500 }, { "epoch": 38.462888226004566, "grad_norm": 0.1563183069229126, "learning_rate": 0.001, "loss": 1.8382, "step": 657600 }, { "epoch": 38.46873720535766, "grad_norm": 0.1362801492214203, "learning_rate": 0.001, "loss": 1.8324, "step": 657700 }, { "epoch": 38.474586184710766, "grad_norm": 0.153603658080101, "learning_rate": 0.001, "loss": 1.8394, "step": 657800 }, { "epoch": 38.48043516406387, "grad_norm": 0.17375236749649048, "learning_rate": 0.001, "loss": 1.8401, "step": 657900 }, { "epoch": 38.486284143416974, "grad_norm": 0.15454836189746857, "learning_rate": 0.001, "loss": 1.8396, "step": 658000 }, { "epoch": 38.49213312277008, "grad_norm": 0.2500344216823578, "learning_rate": 0.001, "loss": 1.8444, "step": 658100 }, { "epoch": 38.49798210212318, "grad_norm": 0.2253265529870987, "learning_rate": 0.001, "loss": 1.8381, "step": 658200 }, { "epoch": 38.503831081476285, "grad_norm": 0.15523745119571686, "learning_rate": 0.001, "loss": 1.841, "step": 658300 }, { "epoch": 38.50968006082938, "grad_norm": 0.19202709197998047, "learning_rate": 0.001, "loss": 1.8444, "step": 658400 }, { "epoch": 38.515529040182486, "grad_norm": 0.2588265836238861, "learning_rate": 0.001, "loss": 1.8376, "step": 658500 }, { "epoch": 38.52137801953559, "grad_norm": 0.1859479546546936, "learning_rate": 0.001, "loss": 1.8507, "step": 658600 }, { "epoch": 38.52722699888869, "grad_norm": 0.22494959831237793, "learning_rate": 0.001, "loss": 1.8408, "step": 658700 }, { "epoch": 38.5330759782418, "grad_norm": 0.266186386346817, "learning_rate": 0.001, "loss": 1.8436, "step": 658800 }, { "epoch": 38.5389249575949, "grad_norm": 0.2708130180835724, "learning_rate": 0.001, "loss": 1.8382, "step": 658900 }, { "epoch": 38.544773936948005, "grad_norm": 0.18203207850456238, "learning_rate": 0.001, "loss": 1.8438, "step": 659000 }, { "epoch": 38.55062291630111, "grad_norm": 0.22333098948001862, "learning_rate": 0.001, "loss": 1.841, "step": 659100 }, { "epoch": 38.556471895654205, "grad_norm": 0.18987137079238892, "learning_rate": 0.001, "loss": 1.8401, "step": 659200 }, { "epoch": 38.56232087500731, "grad_norm": 0.19019782543182373, "learning_rate": 0.001, "loss": 1.8416, "step": 659300 }, { "epoch": 38.56816985436041, "grad_norm": 0.16811053454875946, "learning_rate": 0.001, "loss": 1.8424, "step": 659400 }, { "epoch": 38.57401883371352, "grad_norm": 0.1547643095254898, "learning_rate": 0.001, "loss": 1.8339, "step": 659500 }, { "epoch": 38.57986781306662, "grad_norm": 0.16510561108589172, "learning_rate": 0.001, "loss": 1.8417, "step": 659600 }, { "epoch": 38.585716792419724, "grad_norm": 0.2082662135362625, "learning_rate": 0.001, "loss": 1.846, "step": 659700 }, { "epoch": 38.59156577177283, "grad_norm": 0.2060183435678482, "learning_rate": 0.001, "loss": 1.8365, "step": 659800 }, { "epoch": 38.59741475112593, "grad_norm": 0.24534553289413452, "learning_rate": 0.001, "loss": 1.8499, "step": 659900 }, { "epoch": 38.60326373047903, "grad_norm": 0.15086424350738525, "learning_rate": 0.001, "loss": 1.8392, "step": 660000 }, { "epoch": 38.60911270983213, "grad_norm": 0.15976977348327637, "learning_rate": 0.001, "loss": 1.8409, "step": 660100 }, { "epoch": 38.614961689185236, "grad_norm": 0.20953284204006195, "learning_rate": 0.001, "loss": 1.8422, "step": 660200 }, { "epoch": 38.62081066853834, "grad_norm": 0.17776525020599365, "learning_rate": 0.001, "loss": 1.8413, "step": 660300 }, { "epoch": 38.626659647891444, "grad_norm": 0.25035759806632996, "learning_rate": 0.001, "loss": 1.8373, "step": 660400 }, { "epoch": 38.63250862724455, "grad_norm": 0.18582193553447723, "learning_rate": 0.001, "loss": 1.8446, "step": 660500 }, { "epoch": 38.63835760659765, "grad_norm": 0.18537969887256622, "learning_rate": 0.001, "loss": 1.8432, "step": 660600 }, { "epoch": 38.644206585950755, "grad_norm": 0.25529682636260986, "learning_rate": 0.001, "loss": 1.8385, "step": 660700 }, { "epoch": 38.65005556530385, "grad_norm": 0.1535930335521698, "learning_rate": 0.001, "loss": 1.8369, "step": 660800 }, { "epoch": 38.655904544656956, "grad_norm": 0.14829021692276, "learning_rate": 0.001, "loss": 1.8393, "step": 660900 }, { "epoch": 38.66175352401006, "grad_norm": 0.20411767065525055, "learning_rate": 0.001, "loss": 1.8407, "step": 661000 }, { "epoch": 38.66760250336316, "grad_norm": 0.2600431442260742, "learning_rate": 0.001, "loss": 1.8454, "step": 661100 }, { "epoch": 38.67345148271627, "grad_norm": 0.2159990668296814, "learning_rate": 0.001, "loss": 1.8391, "step": 661200 }, { "epoch": 38.67930046206937, "grad_norm": 0.16413572430610657, "learning_rate": 0.001, "loss": 1.8418, "step": 661300 }, { "epoch": 38.685149441422475, "grad_norm": 0.21445360779762268, "learning_rate": 0.001, "loss": 1.8418, "step": 661400 }, { "epoch": 38.69099842077557, "grad_norm": 0.13790276646614075, "learning_rate": 0.001, "loss": 1.8391, "step": 661500 }, { "epoch": 38.696847400128675, "grad_norm": 0.16433396935462952, "learning_rate": 0.001, "loss": 1.8433, "step": 661600 }, { "epoch": 38.70269637948178, "grad_norm": 0.20753321051597595, "learning_rate": 0.001, "loss": 1.8425, "step": 661700 }, { "epoch": 38.70854535883488, "grad_norm": 0.15531432628631592, "learning_rate": 0.001, "loss": 1.8456, "step": 661800 }, { "epoch": 38.71439433818799, "grad_norm": 0.20284263789653778, "learning_rate": 0.001, "loss": 1.8403, "step": 661900 }, { "epoch": 38.72024331754109, "grad_norm": 0.18287983536720276, "learning_rate": 0.001, "loss": 1.8418, "step": 662000 }, { "epoch": 38.726092296894194, "grad_norm": 0.14915356040000916, "learning_rate": 0.001, "loss": 1.839, "step": 662100 }, { "epoch": 38.7319412762473, "grad_norm": 0.24591605365276337, "learning_rate": 0.001, "loss": 1.8408, "step": 662200 }, { "epoch": 38.737790255600395, "grad_norm": 0.17371241748332977, "learning_rate": 0.001, "loss": 1.8433, "step": 662300 }, { "epoch": 38.7436392349535, "grad_norm": 0.24552008509635925, "learning_rate": 0.001, "loss": 1.8452, "step": 662400 }, { "epoch": 38.7494882143066, "grad_norm": 0.18401727080345154, "learning_rate": 0.001, "loss": 1.8406, "step": 662500 }, { "epoch": 38.755337193659706, "grad_norm": 0.25097644329071045, "learning_rate": 0.001, "loss": 1.8381, "step": 662600 }, { "epoch": 38.76118617301281, "grad_norm": 0.17108605802059174, "learning_rate": 0.001, "loss": 1.8493, "step": 662700 }, { "epoch": 38.767035152365914, "grad_norm": 0.14224618673324585, "learning_rate": 0.001, "loss": 1.842, "step": 662800 }, { "epoch": 38.77288413171902, "grad_norm": 0.18241576850414276, "learning_rate": 0.001, "loss": 1.8397, "step": 662900 }, { "epoch": 38.77873311107212, "grad_norm": 0.1822984367609024, "learning_rate": 0.001, "loss": 1.8431, "step": 663000 }, { "epoch": 38.78458209042522, "grad_norm": 0.17725971341133118, "learning_rate": 0.001, "loss": 1.8446, "step": 663100 }, { "epoch": 38.79043106977832, "grad_norm": 0.2661587595939636, "learning_rate": 0.001, "loss": 1.8452, "step": 663200 }, { "epoch": 38.796280049131425, "grad_norm": 0.15890271961688995, "learning_rate": 0.001, "loss": 1.8394, "step": 663300 }, { "epoch": 38.80212902848453, "grad_norm": 0.1984507441520691, "learning_rate": 0.001, "loss": 1.8413, "step": 663400 }, { "epoch": 38.80797800783763, "grad_norm": 0.16289259493350983, "learning_rate": 0.001, "loss": 1.845, "step": 663500 }, { "epoch": 38.81382698719074, "grad_norm": 0.15997593104839325, "learning_rate": 0.001, "loss": 1.8415, "step": 663600 }, { "epoch": 38.81967596654384, "grad_norm": 0.20975729823112488, "learning_rate": 0.001, "loss": 1.8444, "step": 663700 }, { "epoch": 38.825524945896944, "grad_norm": 0.18361283838748932, "learning_rate": 0.001, "loss": 1.8417, "step": 663800 }, { "epoch": 38.83137392525004, "grad_norm": 0.13910050690174103, "learning_rate": 0.001, "loss": 1.8413, "step": 663900 }, { "epoch": 38.837222904603145, "grad_norm": 0.15939684212207794, "learning_rate": 0.001, "loss": 1.8396, "step": 664000 }, { "epoch": 38.84307188395625, "grad_norm": 0.2313763052225113, "learning_rate": 0.001, "loss": 1.8408, "step": 664100 }, { "epoch": 38.84892086330935, "grad_norm": 0.17499776184558868, "learning_rate": 0.001, "loss": 1.8359, "step": 664200 }, { "epoch": 38.854769842662456, "grad_norm": 0.18959805369377136, "learning_rate": 0.001, "loss": 1.8466, "step": 664300 }, { "epoch": 38.86061882201556, "grad_norm": 0.16273532807826996, "learning_rate": 0.001, "loss": 1.8431, "step": 664400 }, { "epoch": 38.866467801368664, "grad_norm": 0.18448127806186676, "learning_rate": 0.001, "loss": 1.842, "step": 664500 }, { "epoch": 38.87231678072176, "grad_norm": 0.1525171399116516, "learning_rate": 0.001, "loss": 1.8426, "step": 664600 }, { "epoch": 38.878165760074864, "grad_norm": 0.20233254134655, "learning_rate": 0.001, "loss": 1.8483, "step": 664700 }, { "epoch": 38.88401473942797, "grad_norm": 0.17759788036346436, "learning_rate": 0.001, "loss": 1.8319, "step": 664800 }, { "epoch": 38.88986371878107, "grad_norm": 0.16578684747219086, "learning_rate": 0.001, "loss": 1.8439, "step": 664900 }, { "epoch": 38.895712698134176, "grad_norm": 0.16721531748771667, "learning_rate": 0.001, "loss": 1.8452, "step": 665000 }, { "epoch": 38.90156167748728, "grad_norm": 0.27192598581314087, "learning_rate": 0.001, "loss": 1.8449, "step": 665100 }, { "epoch": 38.90741065684038, "grad_norm": 0.21724456548690796, "learning_rate": 0.001, "loss": 1.8442, "step": 665200 }, { "epoch": 38.91325963619349, "grad_norm": 0.17394381761550903, "learning_rate": 0.001, "loss": 1.8397, "step": 665300 }, { "epoch": 38.919108615546584, "grad_norm": 0.14390572905540466, "learning_rate": 0.001, "loss": 1.8388, "step": 665400 }, { "epoch": 38.92495759489969, "grad_norm": 0.19021327793598175, "learning_rate": 0.001, "loss": 1.8417, "step": 665500 }, { "epoch": 38.93080657425279, "grad_norm": 0.19095753133296967, "learning_rate": 0.001, "loss": 1.8438, "step": 665600 }, { "epoch": 38.936655553605895, "grad_norm": 0.2029448002576828, "learning_rate": 0.001, "loss": 1.8418, "step": 665700 }, { "epoch": 38.942504532959, "grad_norm": 0.15658964216709137, "learning_rate": 0.001, "loss": 1.8444, "step": 665800 }, { "epoch": 38.9483535123121, "grad_norm": 0.16871607303619385, "learning_rate": 0.001, "loss": 1.8403, "step": 665900 }, { "epoch": 38.95420249166521, "grad_norm": 0.15709583461284637, "learning_rate": 0.001, "loss": 1.8422, "step": 666000 }, { "epoch": 38.96005147101831, "grad_norm": 0.21927005052566528, "learning_rate": 0.001, "loss": 1.8473, "step": 666100 }, { "epoch": 38.96590045037141, "grad_norm": 0.31293705105781555, "learning_rate": 0.001, "loss": 1.8459, "step": 666200 }, { "epoch": 38.97174942972451, "grad_norm": 0.16380412876605988, "learning_rate": 0.001, "loss": 1.8388, "step": 666300 }, { "epoch": 38.977598409077615, "grad_norm": 0.17744086682796478, "learning_rate": 0.001, "loss": 1.8412, "step": 666400 }, { "epoch": 38.98344738843072, "grad_norm": 0.24600574374198914, "learning_rate": 0.001, "loss": 1.8455, "step": 666500 }, { "epoch": 38.98929636778382, "grad_norm": 0.2241915464401245, "learning_rate": 0.001, "loss": 1.8422, "step": 666600 }, { "epoch": 38.995145347136926, "grad_norm": 0.25121352076530457, "learning_rate": 0.001, "loss": 1.8493, "step": 666700 }, { "epoch": 39.00099432649003, "grad_norm": 0.19390107691287994, "learning_rate": 0.001, "loss": 1.8418, "step": 666800 }, { "epoch": 39.006843305843134, "grad_norm": 0.15790702402591705, "learning_rate": 0.001, "loss": 1.8264, "step": 666900 }, { "epoch": 39.01269228519623, "grad_norm": 0.19654767215251923, "learning_rate": 0.001, "loss": 1.8314, "step": 667000 }, { "epoch": 39.018541264549334, "grad_norm": 0.18752072751522064, "learning_rate": 0.001, "loss": 1.8303, "step": 667100 }, { "epoch": 39.02439024390244, "grad_norm": 0.17674799263477325, "learning_rate": 0.001, "loss": 1.8295, "step": 667200 }, { "epoch": 39.03023922325554, "grad_norm": 0.2291363626718521, "learning_rate": 0.001, "loss": 1.827, "step": 667300 }, { "epoch": 39.036088202608646, "grad_norm": 0.18392759561538696, "learning_rate": 0.001, "loss": 1.8323, "step": 667400 }, { "epoch": 39.04193718196175, "grad_norm": 0.15812143683433533, "learning_rate": 0.001, "loss": 1.8297, "step": 667500 }, { "epoch": 39.04778616131485, "grad_norm": 0.1878802329301834, "learning_rate": 0.001, "loss": 1.831, "step": 667600 }, { "epoch": 39.05363514066795, "grad_norm": 0.19134576618671417, "learning_rate": 0.001, "loss": 1.8285, "step": 667700 }, { "epoch": 39.059484120021054, "grad_norm": 0.20390821993350983, "learning_rate": 0.001, "loss": 1.8285, "step": 667800 }, { "epoch": 39.06533309937416, "grad_norm": 0.19277618825435638, "learning_rate": 0.001, "loss": 1.836, "step": 667900 }, { "epoch": 39.07118207872726, "grad_norm": 0.1725664734840393, "learning_rate": 0.001, "loss": 1.8352, "step": 668000 }, { "epoch": 39.077031058080365, "grad_norm": 0.17909055948257446, "learning_rate": 0.001, "loss": 1.8302, "step": 668100 }, { "epoch": 39.08288003743347, "grad_norm": 0.2318493127822876, "learning_rate": 0.001, "loss": 1.8262, "step": 668200 }, { "epoch": 39.08872901678657, "grad_norm": 0.17909519374370575, "learning_rate": 0.001, "loss": 1.8266, "step": 668300 }, { "epoch": 39.09457799613968, "grad_norm": 0.21318812668323517, "learning_rate": 0.001, "loss": 1.8298, "step": 668400 }, { "epoch": 39.10042697549277, "grad_norm": 0.19338509440422058, "learning_rate": 0.001, "loss": 1.8342, "step": 668500 }, { "epoch": 39.10627595484588, "grad_norm": 0.2652319371700287, "learning_rate": 0.001, "loss": 1.8277, "step": 668600 }, { "epoch": 39.11212493419898, "grad_norm": 0.16521179676055908, "learning_rate": 0.001, "loss": 1.8249, "step": 668700 }, { "epoch": 39.117973913552085, "grad_norm": 0.21550744771957397, "learning_rate": 0.001, "loss": 1.8323, "step": 668800 }, { "epoch": 39.12382289290519, "grad_norm": 0.24381600320339203, "learning_rate": 0.001, "loss": 1.8268, "step": 668900 }, { "epoch": 39.12967187225829, "grad_norm": 0.20407699048519135, "learning_rate": 0.001, "loss": 1.8332, "step": 669000 }, { "epoch": 39.135520851611396, "grad_norm": 0.1847168654203415, "learning_rate": 0.001, "loss": 1.8298, "step": 669100 }, { "epoch": 39.1413698309645, "grad_norm": 0.15757125616073608, "learning_rate": 0.001, "loss": 1.8277, "step": 669200 }, { "epoch": 39.1472188103176, "grad_norm": 0.19619092345237732, "learning_rate": 0.001, "loss": 1.833, "step": 669300 }, { "epoch": 39.1530677896707, "grad_norm": 0.18570388853549957, "learning_rate": 0.001, "loss": 1.831, "step": 669400 }, { "epoch": 39.158916769023804, "grad_norm": 0.1870509833097458, "learning_rate": 0.001, "loss": 1.8335, "step": 669500 }, { "epoch": 39.16476574837691, "grad_norm": 0.28692707419395447, "learning_rate": 0.001, "loss": 1.8372, "step": 669600 }, { "epoch": 39.17061472773001, "grad_norm": 0.20745986700057983, "learning_rate": 0.001, "loss": 1.8357, "step": 669700 }, { "epoch": 39.176463707083116, "grad_norm": 0.20278610289096832, "learning_rate": 0.001, "loss": 1.8335, "step": 669800 }, { "epoch": 39.18231268643622, "grad_norm": 0.20494535565376282, "learning_rate": 0.001, "loss": 1.8354, "step": 669900 }, { "epoch": 39.18816166578932, "grad_norm": 0.21131165325641632, "learning_rate": 0.001, "loss": 1.8353, "step": 670000 }, { "epoch": 39.19401064514242, "grad_norm": 0.19300471246242523, "learning_rate": 0.001, "loss": 1.8339, "step": 670100 }, { "epoch": 39.199859624495524, "grad_norm": 0.23607873916625977, "learning_rate": 0.001, "loss": 1.8305, "step": 670200 }, { "epoch": 39.20570860384863, "grad_norm": 0.25402241945266724, "learning_rate": 0.001, "loss": 1.8357, "step": 670300 }, { "epoch": 39.21155758320173, "grad_norm": 0.272894024848938, "learning_rate": 0.001, "loss": 1.8357, "step": 670400 }, { "epoch": 39.217406562554835, "grad_norm": 0.18910008668899536, "learning_rate": 0.001, "loss": 1.8297, "step": 670500 }, { "epoch": 39.22325554190794, "grad_norm": 0.1654253453016281, "learning_rate": 0.001, "loss": 1.8318, "step": 670600 }, { "epoch": 39.22910452126104, "grad_norm": 0.1864166408777237, "learning_rate": 0.001, "loss": 1.8361, "step": 670700 }, { "epoch": 39.23495350061414, "grad_norm": 0.21323101222515106, "learning_rate": 0.001, "loss": 1.8304, "step": 670800 }, { "epoch": 39.24080247996724, "grad_norm": 0.22038520872592926, "learning_rate": 0.001, "loss": 1.8325, "step": 670900 }, { "epoch": 39.24665145932035, "grad_norm": 0.26464176177978516, "learning_rate": 0.001, "loss": 1.8352, "step": 671000 }, { "epoch": 39.25250043867345, "grad_norm": 0.19635149836540222, "learning_rate": 0.001, "loss": 1.835, "step": 671100 }, { "epoch": 39.258349418026555, "grad_norm": 0.2081468105316162, "learning_rate": 0.001, "loss": 1.8333, "step": 671200 }, { "epoch": 39.26419839737966, "grad_norm": 0.1850726306438446, "learning_rate": 0.001, "loss": 1.8291, "step": 671300 }, { "epoch": 39.27004737673276, "grad_norm": 0.22316677868366241, "learning_rate": 0.001, "loss": 1.8357, "step": 671400 }, { "epoch": 39.275896356085866, "grad_norm": 0.21373866498470306, "learning_rate": 0.001, "loss": 1.8323, "step": 671500 }, { "epoch": 39.28174533543896, "grad_norm": 0.24263834953308105, "learning_rate": 0.001, "loss": 1.8335, "step": 671600 }, { "epoch": 39.28759431479207, "grad_norm": 0.18994702398777008, "learning_rate": 0.001, "loss": 1.8347, "step": 671700 }, { "epoch": 39.29344329414517, "grad_norm": 0.20201201736927032, "learning_rate": 0.001, "loss": 1.8354, "step": 671800 }, { "epoch": 39.299292273498274, "grad_norm": 0.19375447928905487, "learning_rate": 0.001, "loss": 1.8371, "step": 671900 }, { "epoch": 39.30514125285138, "grad_norm": 0.22439241409301758, "learning_rate": 0.001, "loss": 1.8346, "step": 672000 }, { "epoch": 39.31099023220448, "grad_norm": 0.20017193257808685, "learning_rate": 0.001, "loss": 1.8372, "step": 672100 }, { "epoch": 39.316839211557586, "grad_norm": 0.17448556423187256, "learning_rate": 0.001, "loss": 1.8367, "step": 672200 }, { "epoch": 39.32268819091069, "grad_norm": 0.1825675070285797, "learning_rate": 0.001, "loss": 1.8393, "step": 672300 }, { "epoch": 39.328537170263786, "grad_norm": 0.18351978063583374, "learning_rate": 0.001, "loss": 1.8337, "step": 672400 }, { "epoch": 39.33438614961689, "grad_norm": 0.1766163557767868, "learning_rate": 0.001, "loss": 1.8326, "step": 672500 }, { "epoch": 39.340235128969994, "grad_norm": 0.18500562012195587, "learning_rate": 0.001, "loss": 1.8366, "step": 672600 }, { "epoch": 39.3460841083231, "grad_norm": 0.22912059724330902, "learning_rate": 0.001, "loss": 1.8374, "step": 672700 }, { "epoch": 39.3519330876762, "grad_norm": 0.24634644389152527, "learning_rate": 0.001, "loss": 1.8381, "step": 672800 }, { "epoch": 39.357782067029305, "grad_norm": 0.23509231209754944, "learning_rate": 0.001, "loss": 1.8353, "step": 672900 }, { "epoch": 39.36363104638241, "grad_norm": 0.2269616425037384, "learning_rate": 0.001, "loss": 1.8334, "step": 673000 }, { "epoch": 39.36948002573551, "grad_norm": 0.17135098576545715, "learning_rate": 0.001, "loss": 1.8363, "step": 673100 }, { "epoch": 39.37532900508861, "grad_norm": 0.20086251199245453, "learning_rate": 0.001, "loss": 1.8376, "step": 673200 }, { "epoch": 39.38117798444171, "grad_norm": 0.19915494322776794, "learning_rate": 0.001, "loss": 1.8351, "step": 673300 }, { "epoch": 39.38702696379482, "grad_norm": 0.16998912394046783, "learning_rate": 0.001, "loss": 1.8305, "step": 673400 }, { "epoch": 39.39287594314792, "grad_norm": 0.19648145139217377, "learning_rate": 0.001, "loss": 1.8378, "step": 673500 }, { "epoch": 39.398724922501025, "grad_norm": 0.23611415922641754, "learning_rate": 0.001, "loss": 1.8375, "step": 673600 }, { "epoch": 39.40457390185413, "grad_norm": 0.22454258799552917, "learning_rate": 0.001, "loss": 1.8382, "step": 673700 }, { "epoch": 39.41042288120723, "grad_norm": 0.20236799120903015, "learning_rate": 0.001, "loss": 1.8311, "step": 673800 }, { "epoch": 39.41627186056033, "grad_norm": 0.21279509365558624, "learning_rate": 0.001, "loss": 1.8329, "step": 673900 }, { "epoch": 39.42212083991343, "grad_norm": 0.20177221298217773, "learning_rate": 0.001, "loss": 1.8377, "step": 674000 }, { "epoch": 39.427969819266536, "grad_norm": 0.190486878156662, "learning_rate": 0.001, "loss": 1.836, "step": 674100 }, { "epoch": 39.43381879861964, "grad_norm": 0.17065051198005676, "learning_rate": 0.001, "loss": 1.8304, "step": 674200 }, { "epoch": 39.439667777972744, "grad_norm": 0.16792194545269012, "learning_rate": 0.001, "loss": 1.8385, "step": 674300 }, { "epoch": 39.44551675732585, "grad_norm": 0.25764837861061096, "learning_rate": 0.001, "loss": 1.837, "step": 674400 }, { "epoch": 39.45136573667895, "grad_norm": 0.17016081511974335, "learning_rate": 0.001, "loss": 1.8385, "step": 674500 }, { "epoch": 39.457214716032055, "grad_norm": 0.21155694127082825, "learning_rate": 0.001, "loss": 1.8321, "step": 674600 }, { "epoch": 39.46306369538515, "grad_norm": 0.22620868682861328, "learning_rate": 0.001, "loss": 1.8401, "step": 674700 }, { "epoch": 39.468912674738256, "grad_norm": 0.19974756240844727, "learning_rate": 0.001, "loss": 1.8324, "step": 674800 }, { "epoch": 39.47476165409136, "grad_norm": 0.17969545722007751, "learning_rate": 0.001, "loss": 1.8363, "step": 674900 }, { "epoch": 39.48061063344446, "grad_norm": 0.2050425261259079, "learning_rate": 0.001, "loss": 1.8378, "step": 675000 }, { "epoch": 39.48061063344446, "eval_ag_news_accuracy": 0.238953125, "eval_ag_news_bleu_score": 7.917280326966226, "eval_ag_news_bleu_score_sem": 0.5380590856647519, "eval_ag_news_emb_cos_sim": 0.7131774425506592, "eval_ag_news_emb_cos_sim_sem": 0.01360347680747509, "eval_ag_news_emb_top1_equal": 0.9609375, "eval_ag_news_emb_top1_equal_sem": 0.017191974446177483, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7217817306518555, "eval_ag_news_n_ngrams_match_1": 14.625, "eval_ag_news_n_ngrams_match_2": 4.421875, "eval_ag_news_n_ngrams_match_3": 1.78125, "eval_ag_news_num_pred_words": 48.84375, "eval_ag_news_num_true_words": 46.2421875, "eval_ag_news_perplexity": 15.207393600172818, "eval_ag_news_pred_num_tokens": 73.3828125, "eval_ag_news_rouge_score": 0.30598920425155984, "eval_ag_news_runtime": 37.1758, "eval_ag_news_samples_per_second": 13.45, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3409047991368284, "eval_ag_news_token_set_f1_sem": 0.010187540442829518, "eval_ag_news_token_set_precision": 0.32320393145116805, "eval_ag_news_token_set_recall": 0.37062512218085036, "eval_ag_news_true_num_tokens": 64.375, "step": 675000 }, { "epoch": 39.48061063344446, "eval_anthropic_toxic_prompts_accuracy": 0.103671875, "eval_anthropic_toxic_prompts_bleu_score": 43.822980476896944, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.7669470992460643, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.888046145439148, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009443351998925209, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.15625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03221922171672519, "eval_anthropic_toxic_prompts_loss": 1.2553759813308716, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.7109375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.9140625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.8359375, "eval_anthropic_toxic_prompts_num_pred_words": 15.7421875, "eval_anthropic_toxic_prompts_num_true_words": 15.5390625, "eval_anthropic_toxic_prompts_perplexity": 3.5091575040515823, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.03125, "eval_anthropic_toxic_prompts_rouge_score": 0.6972314054759187, "eval_anthropic_toxic_prompts_runtime": 29.3269, "eval_anthropic_toxic_prompts_samples_per_second": 17.049, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.7113917212290094, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.019456722592408535, "eval_anthropic_toxic_prompts_token_set_precision": 0.7121721603808145, "eval_anthropic_toxic_prompts_token_set_recall": 0.7156723542765697, "eval_anthropic_toxic_prompts_true_num_tokens": 18.890625, "step": 675000 }, { "epoch": 39.48061063344446, "eval_arxiv_accuracy": 0.37659375, "eval_arxiv_bleu_score": 1.604121705048342, "eval_arxiv_bleu_score_sem": 0.12910600988809712, "eval_arxiv_emb_cos_sim": 0.5005611181259155, "eval_arxiv_emb_cos_sim_sem": 0.019268576055765152, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4384913444519043, "eval_arxiv_n_ngrams_match_1": 13.2421875, "eval_arxiv_n_ngrams_match_2": 2.390625, "eval_arxiv_n_ngrams_match_3": 0.375, "eval_arxiv_num_pred_words": 56.578125, "eval_arxiv_num_true_words": 86.40625, "eval_arxiv_perplexity": 31.139943264405744, "eval_arxiv_pred_num_tokens": 125.640625, "eval_arxiv_rouge_score": 0.1768530751350107, "eval_arxiv_runtime": 29.2808, "eval_arxiv_samples_per_second": 17.076, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.18371254357887534, "eval_arxiv_token_set_f1_sem": 0.008023842043369945, "eval_arxiv_token_set_precision": 0.12517458038544035, "eval_arxiv_token_set_recall": 0.46329638316700195, "eval_arxiv_true_num_tokens": 125.4375, "step": 675000 }, { "epoch": 39.48061063344446, "eval_python_code_alpaca_accuracy": 0.130578125, "eval_python_code_alpaca_bleu_score": 29.157186931471042, "eval_python_code_alpaca_bleu_score_sem": 1.5433086437037749, "eval_python_code_alpaca_emb_cos_sim": 0.8689022660255432, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008902308531105518, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.498102068901062, "eval_python_code_alpaca_n_ngrams_match_1": 10.703125, "eval_python_code_alpaca_n_ngrams_match_2": 5.9921875, "eval_python_code_alpaca_n_ngrams_match_3": 3.4375, "eval_python_code_alpaca_num_pred_words": 17.640625, "eval_python_code_alpaca_num_true_words": 18.2578125, "eval_python_code_alpaca_perplexity": 4.47319120001276, "eval_python_code_alpaca_pred_num_tokens": 22.5078125, "eval_python_code_alpaca_rouge_score": 0.6131419161461015, "eval_python_code_alpaca_runtime": 29.5526, "eval_python_code_alpaca_samples_per_second": 16.919, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6332330537810464, "eval_python_code_alpaca_token_set_f1_sem": 0.01262059435343718, "eval_python_code_alpaca_token_set_precision": 0.6239569917334427, "eval_python_code_alpaca_token_set_recall": 0.6478674330915123, "eval_python_code_alpaca_true_num_tokens": 23.34375, "step": 675000 }, { "epoch": 39.48061063344446, "eval_wikibio_accuracy": 0.364109375, "eval_wikibio_bleu_score": 7.8016065975133255, "eval_wikibio_bleu_score_sem": 0.6775188478831297, "eval_wikibio_emb_cos_sim": 0.6700987815856934, "eval_wikibio_emb_cos_sim_sem": 0.020561086013913155, "eval_wikibio_emb_top1_equal": 0.953125, "eval_wikibio_emb_top1_equal_sem": 0.01875615119934082, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.717292070388794, "eval_wikibio_n_ngrams_match_1": 15.78125, "eval_wikibio_n_ngrams_match_2": 5.578125, "eval_wikibio_n_ngrams_match_3": 2.203125, "eval_wikibio_num_pred_words": 54.609375, "eval_wikibio_num_true_words": 52.140625, "eval_wikibio_perplexity": 15.139270608395814, "eval_wikibio_pred_num_tokens": 105.125, "eval_wikibio_rouge_score": 0.31432416966444066, "eval_wikibio_runtime": 29.0454, "eval_wikibio_samples_per_second": 17.214, "eval_wikibio_steps_per_second": 0.034, "eval_wikibio_token_set_f1": 0.34108230295634645, "eval_wikibio_token_set_f1_sem": 0.011127967883367102, "eval_wikibio_token_set_precision": 0.3061758264202111, "eval_wikibio_token_set_recall": 0.4145483393781744, "eval_wikibio_true_num_tokens": 100.0, "step": 675000 }, { "epoch": 39.48061063344446, "eval_msmarco_accuracy": 0.400765625, "eval_msmarco_bleu_score": 17.50517676166592, "eval_msmarco_bleu_score_sem": 1.442960936100107, "eval_msmarco_emb_cos_sim": 0.7645117044448853, "eval_msmarco_emb_cos_sim_sem": 0.018225187435746193, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7381924390792847, "eval_msmarco_n_ngrams_match_1": 28.734375, "eval_msmarco_n_ngrams_match_2": 13.0859375, "eval_msmarco_n_ngrams_match_3": 7.3203125, "eval_msmarco_num_pred_words": 63.7265625, "eval_msmarco_num_true_words": 62.4765625, "eval_msmarco_perplexity": 5.687054429141999, "eval_msmarco_pred_num_tokens": 86.9296875, "eval_msmarco_rouge_score": 0.4423908250593636, "eval_msmarco_runtime": 25.9908, "eval_msmarco_samples_per_second": 19.238, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.4675217662321604, "eval_msmarco_token_set_f1_sem": 0.014651607557670216, "eval_msmarco_token_set_precision": 0.4336794763831852, "eval_msmarco_token_set_recall": 0.5250255809627047, "eval_msmarco_true_num_tokens": 81.640625, "step": 675000 }, { "epoch": 39.48645961279757, "grad_norm": 0.18495048582553864, "learning_rate": 0.001, "loss": 1.8366, "step": 675100 }, { "epoch": 39.49230859215067, "grad_norm": 0.16605713963508606, "learning_rate": 0.001, "loss": 1.8362, "step": 675200 }, { "epoch": 39.498157571503775, "grad_norm": 0.18823587894439697, "learning_rate": 0.001, "loss": 1.8376, "step": 675300 }, { "epoch": 39.50400655085688, "grad_norm": 0.2252536565065384, "learning_rate": 0.001, "loss": 1.8378, "step": 675400 }, { "epoch": 39.509855530209975, "grad_norm": 0.12275430560112, "learning_rate": 0.001, "loss": 1.8388, "step": 675500 }, { "epoch": 39.51570450956308, "grad_norm": 0.1910851001739502, "learning_rate": 0.001, "loss": 1.8335, "step": 675600 }, { "epoch": 39.52155348891618, "grad_norm": 0.20399172604084015, "learning_rate": 0.001, "loss": 1.8356, "step": 675700 }, { "epoch": 39.52740246826929, "grad_norm": 0.18918660283088684, "learning_rate": 0.001, "loss": 1.8403, "step": 675800 }, { "epoch": 39.53325144762239, "grad_norm": 0.22900503873825073, "learning_rate": 0.001, "loss": 1.8404, "step": 675900 }, { "epoch": 39.539100426975494, "grad_norm": 0.16548293828964233, "learning_rate": 0.001, "loss": 1.8346, "step": 676000 }, { "epoch": 39.5449494063286, "grad_norm": 0.2505449056625366, "learning_rate": 0.001, "loss": 1.83, "step": 676100 }, { "epoch": 39.5507983856817, "grad_norm": 0.174922376871109, "learning_rate": 0.001, "loss": 1.8324, "step": 676200 }, { "epoch": 39.5566473650348, "grad_norm": 0.17367874085903168, "learning_rate": 0.001, "loss": 1.8427, "step": 676300 }, { "epoch": 39.5624963443879, "grad_norm": 0.22932729125022888, "learning_rate": 0.001, "loss": 1.8341, "step": 676400 }, { "epoch": 39.568345323741006, "grad_norm": 0.18075434863567352, "learning_rate": 0.001, "loss": 1.8408, "step": 676500 }, { "epoch": 39.57419430309411, "grad_norm": 0.2504757046699524, "learning_rate": 0.001, "loss": 1.8391, "step": 676600 }, { "epoch": 39.580043282447214, "grad_norm": 0.1699330061674118, "learning_rate": 0.001, "loss": 1.8425, "step": 676700 }, { "epoch": 39.58589226180032, "grad_norm": 0.17011003196239471, "learning_rate": 0.001, "loss": 1.8353, "step": 676800 }, { "epoch": 39.59174124115342, "grad_norm": 0.2457464039325714, "learning_rate": 0.001, "loss": 1.8357, "step": 676900 }, { "epoch": 39.59759022050652, "grad_norm": 0.18244953453540802, "learning_rate": 0.001, "loss": 1.8399, "step": 677000 }, { "epoch": 39.60343919985962, "grad_norm": 0.20612744987010956, "learning_rate": 0.001, "loss": 1.8414, "step": 677100 }, { "epoch": 39.609288179212726, "grad_norm": 0.21604783833026886, "learning_rate": 0.001, "loss": 1.8383, "step": 677200 }, { "epoch": 39.61513715856583, "grad_norm": 0.18019938468933105, "learning_rate": 0.001, "loss": 1.8398, "step": 677300 }, { "epoch": 39.62098613791893, "grad_norm": 0.21106089651584625, "learning_rate": 0.001, "loss": 1.8358, "step": 677400 }, { "epoch": 39.62683511727204, "grad_norm": 0.18844760954380035, "learning_rate": 0.001, "loss": 1.834, "step": 677500 }, { "epoch": 39.63268409662514, "grad_norm": 0.17022019624710083, "learning_rate": 0.001, "loss": 1.8328, "step": 677600 }, { "epoch": 39.638533075978245, "grad_norm": 0.20693647861480713, "learning_rate": 0.001, "loss": 1.8388, "step": 677700 }, { "epoch": 39.64438205533134, "grad_norm": 0.16017812490463257, "learning_rate": 0.001, "loss": 1.8385, "step": 677800 }, { "epoch": 39.650231034684445, "grad_norm": 0.16879627108573914, "learning_rate": 0.001, "loss": 1.8391, "step": 677900 }, { "epoch": 39.65608001403755, "grad_norm": 0.16294419765472412, "learning_rate": 0.001, "loss": 1.8356, "step": 678000 }, { "epoch": 39.66192899339065, "grad_norm": 0.20176975429058075, "learning_rate": 0.001, "loss": 1.8365, "step": 678100 }, { "epoch": 39.66777797274376, "grad_norm": 0.20712800323963165, "learning_rate": 0.001, "loss": 1.8428, "step": 678200 }, { "epoch": 39.67362695209686, "grad_norm": 0.20598208904266357, "learning_rate": 0.001, "loss": 1.8368, "step": 678300 }, { "epoch": 39.679475931449964, "grad_norm": 0.25011301040649414, "learning_rate": 0.001, "loss": 1.8437, "step": 678400 }, { "epoch": 39.68532491080307, "grad_norm": 0.19550785422325134, "learning_rate": 0.001, "loss": 1.8347, "step": 678500 }, { "epoch": 39.691173890156165, "grad_norm": 0.2172870635986328, "learning_rate": 0.001, "loss": 1.84, "step": 678600 }, { "epoch": 39.69702286950927, "grad_norm": 0.1887778788805008, "learning_rate": 0.001, "loss": 1.8386, "step": 678700 }, { "epoch": 39.70287184886237, "grad_norm": 0.16827651858329773, "learning_rate": 0.001, "loss": 1.8421, "step": 678800 }, { "epoch": 39.708720828215476, "grad_norm": 0.1752002239227295, "learning_rate": 0.001, "loss": 1.8375, "step": 678900 }, { "epoch": 39.71456980756858, "grad_norm": 0.18890973925590515, "learning_rate": 0.001, "loss": 1.845, "step": 679000 }, { "epoch": 39.720418786921684, "grad_norm": 0.19708891212940216, "learning_rate": 0.001, "loss": 1.8395, "step": 679100 }, { "epoch": 39.72626776627479, "grad_norm": 0.2862039804458618, "learning_rate": 0.001, "loss": 1.8417, "step": 679200 }, { "epoch": 39.73211674562789, "grad_norm": 0.18081088364124298, "learning_rate": 0.001, "loss": 1.8368, "step": 679300 }, { "epoch": 39.73796572498099, "grad_norm": 0.21613936126232147, "learning_rate": 0.001, "loss": 1.8404, "step": 679400 }, { "epoch": 39.74381470433409, "grad_norm": 0.17799650132656097, "learning_rate": 0.001, "loss": 1.8366, "step": 679500 }, { "epoch": 39.749663683687196, "grad_norm": 0.19386081397533417, "learning_rate": 0.001, "loss": 1.8357, "step": 679600 }, { "epoch": 39.7555126630403, "grad_norm": 0.198768749833107, "learning_rate": 0.001, "loss": 1.8386, "step": 679700 }, { "epoch": 39.7613616423934, "grad_norm": 0.20760369300842285, "learning_rate": 0.001, "loss": 1.8397, "step": 679800 }, { "epoch": 39.76721062174651, "grad_norm": 0.2094574123620987, "learning_rate": 0.001, "loss": 1.8411, "step": 679900 }, { "epoch": 39.77305960109961, "grad_norm": 0.1815589964389801, "learning_rate": 0.001, "loss": 1.835, "step": 680000 }, { "epoch": 39.77890858045271, "grad_norm": 0.18645842373371124, "learning_rate": 0.001, "loss": 1.8414, "step": 680100 }, { "epoch": 39.78475755980581, "grad_norm": 0.17709635198116302, "learning_rate": 0.001, "loss": 1.8386, "step": 680200 }, { "epoch": 39.790606539158915, "grad_norm": 0.21575674414634705, "learning_rate": 0.001, "loss": 1.8442, "step": 680300 }, { "epoch": 39.79645551851202, "grad_norm": 0.22574256360530853, "learning_rate": 0.001, "loss": 1.8334, "step": 680400 }, { "epoch": 39.80230449786512, "grad_norm": 0.20754030346870422, "learning_rate": 0.001, "loss": 1.8407, "step": 680500 }, { "epoch": 39.80815347721823, "grad_norm": 0.1951066255569458, "learning_rate": 0.001, "loss": 1.8438, "step": 680600 }, { "epoch": 39.81400245657133, "grad_norm": 0.20578643679618835, "learning_rate": 0.001, "loss": 1.8382, "step": 680700 }, { "epoch": 39.819851435924434, "grad_norm": 0.2084241509437561, "learning_rate": 0.001, "loss": 1.8371, "step": 680800 }, { "epoch": 39.82570041527753, "grad_norm": 0.2442537248134613, "learning_rate": 0.001, "loss": 1.8418, "step": 680900 }, { "epoch": 39.831549394630635, "grad_norm": 0.17536015808582306, "learning_rate": 0.001, "loss": 1.8486, "step": 681000 }, { "epoch": 39.83739837398374, "grad_norm": 0.22143888473510742, "learning_rate": 0.001, "loss": 1.8409, "step": 681100 }, { "epoch": 39.84324735333684, "grad_norm": 0.20861473679542542, "learning_rate": 0.001, "loss": 1.8371, "step": 681200 }, { "epoch": 39.849096332689946, "grad_norm": 0.18794618546962738, "learning_rate": 0.001, "loss": 1.8466, "step": 681300 }, { "epoch": 39.85494531204305, "grad_norm": 0.19010086357593536, "learning_rate": 0.001, "loss": 1.8343, "step": 681400 }, { "epoch": 39.860794291396154, "grad_norm": 0.2115185409784317, "learning_rate": 0.001, "loss": 1.8434, "step": 681500 }, { "epoch": 39.86664327074926, "grad_norm": 0.18863791227340698, "learning_rate": 0.001, "loss": 1.8412, "step": 681600 }, { "epoch": 39.872492250102354, "grad_norm": 0.1818702220916748, "learning_rate": 0.001, "loss": 1.836, "step": 681700 }, { "epoch": 39.87834122945546, "grad_norm": 0.19586847722530365, "learning_rate": 0.001, "loss": 1.8401, "step": 681800 }, { "epoch": 39.88419020880856, "grad_norm": 0.15446504950523376, "learning_rate": 0.001, "loss": 1.8372, "step": 681900 }, { "epoch": 39.890039188161666, "grad_norm": 0.2262771725654602, "learning_rate": 0.001, "loss": 1.8372, "step": 682000 }, { "epoch": 39.89588816751477, "grad_norm": 0.24827642738819122, "learning_rate": 0.001, "loss": 1.8441, "step": 682100 }, { "epoch": 39.90173714686787, "grad_norm": 0.19790586829185486, "learning_rate": 0.001, "loss": 1.8395, "step": 682200 }, { "epoch": 39.90758612622098, "grad_norm": 0.19231277704238892, "learning_rate": 0.001, "loss": 1.8423, "step": 682300 }, { "epoch": 39.91343510557408, "grad_norm": 0.19336365163326263, "learning_rate": 0.001, "loss": 1.8425, "step": 682400 }, { "epoch": 39.91928408492718, "grad_norm": 0.20743627846240997, "learning_rate": 0.001, "loss": 1.8412, "step": 682500 }, { "epoch": 39.92513306428028, "grad_norm": 0.18178294599056244, "learning_rate": 0.001, "loss": 1.8398, "step": 682600 }, { "epoch": 39.930982043633385, "grad_norm": 0.1614583134651184, "learning_rate": 0.001, "loss": 1.8419, "step": 682700 }, { "epoch": 39.93683102298649, "grad_norm": 0.17847660183906555, "learning_rate": 0.001, "loss": 1.8368, "step": 682800 }, { "epoch": 39.94268000233959, "grad_norm": 0.1935366839170456, "learning_rate": 0.001, "loss": 1.8391, "step": 682900 }, { "epoch": 39.9485289816927, "grad_norm": 0.25726959109306335, "learning_rate": 0.001, "loss": 1.8399, "step": 683000 }, { "epoch": 39.9543779610458, "grad_norm": 0.21811047196388245, "learning_rate": 0.001, "loss": 1.836, "step": 683100 }, { "epoch": 39.9602269403989, "grad_norm": 0.17763742804527283, "learning_rate": 0.001, "loss": 1.844, "step": 683200 }, { "epoch": 39.966075919752, "grad_norm": 0.23462234437465668, "learning_rate": 0.001, "loss": 1.8407, "step": 683300 }, { "epoch": 39.971924899105105, "grad_norm": 0.16382580995559692, "learning_rate": 0.001, "loss": 1.835, "step": 683400 }, { "epoch": 39.97777387845821, "grad_norm": 0.24585509300231934, "learning_rate": 0.001, "loss": 1.8392, "step": 683500 }, { "epoch": 39.98362285781131, "grad_norm": 0.18830737471580505, "learning_rate": 0.001, "loss": 1.8394, "step": 683600 }, { "epoch": 39.989471837164416, "grad_norm": 0.2050073742866516, "learning_rate": 0.001, "loss": 1.8397, "step": 683700 }, { "epoch": 39.99532081651752, "grad_norm": 0.2404656857252121, "learning_rate": 0.001, "loss": 1.842, "step": 683800 }, { "epoch": 40.001169795870624, "grad_norm": 0.15479879081249237, "learning_rate": 0.001, "loss": 1.8414, "step": 683900 }, { "epoch": 40.00701877522372, "grad_norm": 0.20727461576461792, "learning_rate": 0.001, "loss": 1.8202, "step": 684000 }, { "epoch": 40.012867754576824, "grad_norm": 0.18630456924438477, "learning_rate": 0.001, "loss": 1.8235, "step": 684100 }, { "epoch": 40.01871673392993, "grad_norm": 0.18760499358177185, "learning_rate": 0.001, "loss": 1.8238, "step": 684200 }, { "epoch": 40.02456571328303, "grad_norm": 0.23370033502578735, "learning_rate": 0.001, "loss": 1.8234, "step": 684300 }, { "epoch": 40.030414692636135, "grad_norm": 0.177614226937294, "learning_rate": 0.001, "loss": 1.8293, "step": 684400 }, { "epoch": 40.03626367198924, "grad_norm": 0.18647490441799164, "learning_rate": 0.001, "loss": 1.8222, "step": 684500 }, { "epoch": 40.04211265134234, "grad_norm": 0.21199479699134827, "learning_rate": 0.001, "loss": 1.8283, "step": 684600 }, { "epoch": 40.04796163069545, "grad_norm": 0.18754136562347412, "learning_rate": 0.001, "loss": 1.8308, "step": 684700 }, { "epoch": 40.053810610048544, "grad_norm": 0.19939051568508148, "learning_rate": 0.001, "loss": 1.8187, "step": 684800 }, { "epoch": 40.05965958940165, "grad_norm": 0.14346811175346375, "learning_rate": 0.001, "loss": 1.8317, "step": 684900 }, { "epoch": 40.06550856875475, "grad_norm": 0.17370864748954773, "learning_rate": 0.001, "loss": 1.8237, "step": 685000 }, { "epoch": 40.071357548107855, "grad_norm": 0.20245645940303802, "learning_rate": 0.001, "loss": 1.8293, "step": 685100 }, { "epoch": 40.07720652746096, "grad_norm": 0.1684252768754959, "learning_rate": 0.001, "loss": 1.8278, "step": 685200 }, { "epoch": 40.08305550681406, "grad_norm": 0.2865194082260132, "learning_rate": 0.001, "loss": 1.8266, "step": 685300 }, { "epoch": 40.088904486167166, "grad_norm": 0.25223350524902344, "learning_rate": 0.001, "loss": 1.8336, "step": 685400 }, { "epoch": 40.09475346552027, "grad_norm": 0.2239122986793518, "learning_rate": 0.001, "loss": 1.8241, "step": 685500 }, { "epoch": 40.10060244487337, "grad_norm": 0.18348826467990875, "learning_rate": 0.001, "loss": 1.8371, "step": 685600 }, { "epoch": 40.10645142422647, "grad_norm": 0.2106427699327469, "learning_rate": 0.001, "loss": 1.8273, "step": 685700 }, { "epoch": 40.112300403579574, "grad_norm": 0.1886352300643921, "learning_rate": 0.001, "loss": 1.8241, "step": 685800 }, { "epoch": 40.11814938293268, "grad_norm": 0.22951287031173706, "learning_rate": 0.001, "loss": 1.8314, "step": 685900 }, { "epoch": 40.12399836228578, "grad_norm": 0.19282259047031403, "learning_rate": 0.001, "loss": 1.8282, "step": 686000 }, { "epoch": 40.129847341638886, "grad_norm": 0.209906205534935, "learning_rate": 0.001, "loss": 1.8314, "step": 686100 }, { "epoch": 40.13569632099199, "grad_norm": 0.21683065593242645, "learning_rate": 0.001, "loss": 1.8271, "step": 686200 }, { "epoch": 40.141545300345086, "grad_norm": 0.15746088325977325, "learning_rate": 0.001, "loss": 1.827, "step": 686300 }, { "epoch": 40.14739427969819, "grad_norm": 0.1716715693473816, "learning_rate": 0.001, "loss": 1.8249, "step": 686400 }, { "epoch": 40.153243259051294, "grad_norm": 0.14015312492847443, "learning_rate": 0.001, "loss": 1.8315, "step": 686500 }, { "epoch": 40.1590922384044, "grad_norm": 0.24368520081043243, "learning_rate": 0.001, "loss": 1.8285, "step": 686600 }, { "epoch": 40.1649412177575, "grad_norm": 0.2475828230381012, "learning_rate": 0.001, "loss": 1.8279, "step": 686700 }, { "epoch": 40.170790197110605, "grad_norm": 0.18485262989997864, "learning_rate": 0.001, "loss": 1.8287, "step": 686800 }, { "epoch": 40.17663917646371, "grad_norm": 0.18112698197364807, "learning_rate": 0.001, "loss": 1.8277, "step": 686900 }, { "epoch": 40.18248815581681, "grad_norm": 0.21316856145858765, "learning_rate": 0.001, "loss": 1.8296, "step": 687000 }, { "epoch": 40.18833713516991, "grad_norm": 0.18437139689922333, "learning_rate": 0.001, "loss": 1.8291, "step": 687100 }, { "epoch": 40.19418611452301, "grad_norm": 0.19719761610031128, "learning_rate": 0.001, "loss": 1.8265, "step": 687200 }, { "epoch": 40.20003509387612, "grad_norm": 0.17600828409194946, "learning_rate": 0.001, "loss": 1.8262, "step": 687300 }, { "epoch": 40.20588407322922, "grad_norm": 0.2322249561548233, "learning_rate": 0.001, "loss": 1.8289, "step": 687400 }, { "epoch": 40.211733052582325, "grad_norm": 0.1989295333623886, "learning_rate": 0.001, "loss": 1.8368, "step": 687500 }, { "epoch": 40.21758203193543, "grad_norm": 0.22749628126621246, "learning_rate": 0.001, "loss": 1.8285, "step": 687600 }, { "epoch": 40.22343101128853, "grad_norm": 0.19314716756343842, "learning_rate": 0.001, "loss": 1.8255, "step": 687700 }, { "epoch": 40.229279990641636, "grad_norm": 0.19002501666545868, "learning_rate": 0.001, "loss": 1.8353, "step": 687800 }, { "epoch": 40.23512896999473, "grad_norm": 0.18758367002010345, "learning_rate": 0.001, "loss": 1.8288, "step": 687900 }, { "epoch": 40.24097794934784, "grad_norm": 0.22027207911014557, "learning_rate": 0.001, "loss": 1.8317, "step": 688000 }, { "epoch": 40.24682692870094, "grad_norm": 0.24151922762393951, "learning_rate": 0.001, "loss": 1.8276, "step": 688100 }, { "epoch": 40.252675908054044, "grad_norm": 0.20024153590202332, "learning_rate": 0.001, "loss": 1.8305, "step": 688200 }, { "epoch": 40.25852488740715, "grad_norm": 0.17669963836669922, "learning_rate": 0.001, "loss": 1.8274, "step": 688300 }, { "epoch": 40.26437386676025, "grad_norm": 0.1877906322479248, "learning_rate": 0.001, "loss": 1.8288, "step": 688400 }, { "epoch": 40.270222846113356, "grad_norm": 0.1446625143289566, "learning_rate": 0.001, "loss": 1.8231, "step": 688500 }, { "epoch": 40.27607182546646, "grad_norm": 0.17413654923439026, "learning_rate": 0.001, "loss": 1.8305, "step": 688600 }, { "epoch": 40.281920804819556, "grad_norm": 0.16159193217754364, "learning_rate": 0.001, "loss": 1.8304, "step": 688700 }, { "epoch": 40.28776978417266, "grad_norm": 0.15200579166412354, "learning_rate": 0.001, "loss": 1.83, "step": 688800 }, { "epoch": 40.293618763525764, "grad_norm": 0.16195344924926758, "learning_rate": 0.001, "loss": 1.8255, "step": 688900 }, { "epoch": 40.29946774287887, "grad_norm": 0.23904410004615784, "learning_rate": 0.001, "loss": 1.834, "step": 689000 }, { "epoch": 40.30531672223197, "grad_norm": 0.17664165794849396, "learning_rate": 0.001, "loss": 1.8347, "step": 689100 }, { "epoch": 40.311165701585075, "grad_norm": 0.16977708041667938, "learning_rate": 0.001, "loss": 1.829, "step": 689200 }, { "epoch": 40.31701468093818, "grad_norm": 0.1718987673521042, "learning_rate": 0.001, "loss": 1.8323, "step": 689300 }, { "epoch": 40.322863660291276, "grad_norm": 0.18001659214496613, "learning_rate": 0.001, "loss": 1.8366, "step": 689400 }, { "epoch": 40.32871263964438, "grad_norm": 0.1833285093307495, "learning_rate": 0.001, "loss": 1.8312, "step": 689500 }, { "epoch": 40.33456161899748, "grad_norm": 0.21309292316436768, "learning_rate": 0.001, "loss": 1.8313, "step": 689600 }, { "epoch": 40.34041059835059, "grad_norm": 0.24371716380119324, "learning_rate": 0.001, "loss": 1.8389, "step": 689700 }, { "epoch": 40.34625957770369, "grad_norm": 0.18949586153030396, "learning_rate": 0.001, "loss": 1.8329, "step": 689800 }, { "epoch": 40.352108557056795, "grad_norm": 0.19382989406585693, "learning_rate": 0.001, "loss": 1.8319, "step": 689900 }, { "epoch": 40.3579575364099, "grad_norm": 0.16865801811218262, "learning_rate": 0.001, "loss": 1.8294, "step": 690000 }, { "epoch": 40.363806515763, "grad_norm": 0.20814211666584015, "learning_rate": 0.001, "loss": 1.8359, "step": 690100 }, { "epoch": 40.3696554951161, "grad_norm": 0.22721406817436218, "learning_rate": 0.001, "loss": 1.835, "step": 690200 }, { "epoch": 40.3755044744692, "grad_norm": 0.17793914675712585, "learning_rate": 0.001, "loss": 1.8314, "step": 690300 }, { "epoch": 40.38135345382231, "grad_norm": 0.18306131660938263, "learning_rate": 0.001, "loss": 1.8301, "step": 690400 }, { "epoch": 40.38720243317541, "grad_norm": 0.19880330562591553, "learning_rate": 0.001, "loss": 1.8324, "step": 690500 }, { "epoch": 40.393051412528514, "grad_norm": 0.17834778130054474, "learning_rate": 0.001, "loss": 1.8398, "step": 690600 }, { "epoch": 40.39890039188162, "grad_norm": 0.15348589420318604, "learning_rate": 0.001, "loss": 1.8289, "step": 690700 }, { "epoch": 40.40474937123472, "grad_norm": 0.20437441766262054, "learning_rate": 0.001, "loss": 1.834, "step": 690800 }, { "epoch": 40.410598350587826, "grad_norm": 0.17331384122371674, "learning_rate": 0.001, "loss": 1.8316, "step": 690900 }, { "epoch": 40.41644732994092, "grad_norm": 0.2149791568517685, "learning_rate": 0.001, "loss": 1.8415, "step": 691000 }, { "epoch": 40.422296309294026, "grad_norm": 0.16590386629104614, "learning_rate": 0.001, "loss": 1.8336, "step": 691100 }, { "epoch": 40.42814528864713, "grad_norm": 0.20703627169132233, "learning_rate": 0.001, "loss": 1.8311, "step": 691200 }, { "epoch": 40.433994268000234, "grad_norm": 0.21372556686401367, "learning_rate": 0.001, "loss": 1.8336, "step": 691300 }, { "epoch": 40.43984324735334, "grad_norm": 0.1690928339958191, "learning_rate": 0.001, "loss": 1.8371, "step": 691400 }, { "epoch": 40.44569222670644, "grad_norm": 0.19323672354221344, "learning_rate": 0.001, "loss": 1.8308, "step": 691500 }, { "epoch": 40.451541206059545, "grad_norm": 0.17374476790428162, "learning_rate": 0.001, "loss": 1.8393, "step": 691600 }, { "epoch": 40.45739018541265, "grad_norm": 0.17123842239379883, "learning_rate": 0.001, "loss": 1.831, "step": 691700 }, { "epoch": 40.463239164765746, "grad_norm": 0.20561186969280243, "learning_rate": 0.001, "loss": 1.8309, "step": 691800 }, { "epoch": 40.46908814411885, "grad_norm": 0.1606750339269638, "learning_rate": 0.001, "loss": 1.8299, "step": 691900 }, { "epoch": 40.47493712347195, "grad_norm": 0.16961516439914703, "learning_rate": 0.001, "loss": 1.8322, "step": 692000 }, { "epoch": 40.48078610282506, "grad_norm": 0.2329692244529724, "learning_rate": 0.001, "loss": 1.8341, "step": 692100 }, { "epoch": 40.48663508217816, "grad_norm": 0.17092572152614594, "learning_rate": 0.001, "loss": 1.8349, "step": 692200 }, { "epoch": 40.492484061531265, "grad_norm": 0.19062617421150208, "learning_rate": 0.001, "loss": 1.8359, "step": 692300 }, { "epoch": 40.49833304088437, "grad_norm": 0.20091770589351654, "learning_rate": 0.001, "loss": 1.8386, "step": 692400 }, { "epoch": 40.504182020237465, "grad_norm": 0.204321026802063, "learning_rate": 0.001, "loss": 1.8282, "step": 692500 }, { "epoch": 40.51003099959057, "grad_norm": 0.1742076724767685, "learning_rate": 0.001, "loss": 1.837, "step": 692600 }, { "epoch": 40.51587997894367, "grad_norm": 0.22611244022846222, "learning_rate": 0.001, "loss": 1.8322, "step": 692700 }, { "epoch": 40.52172895829678, "grad_norm": 0.22874774038791656, "learning_rate": 0.001, "loss": 1.8289, "step": 692800 }, { "epoch": 40.52757793764988, "grad_norm": 0.2885047197341919, "learning_rate": 0.001, "loss": 1.8307, "step": 692900 }, { "epoch": 40.533426917002984, "grad_norm": 0.20063328742980957, "learning_rate": 0.001, "loss": 1.8395, "step": 693000 }, { "epoch": 40.53927589635609, "grad_norm": 0.21551714837551117, "learning_rate": 0.001, "loss": 1.8398, "step": 693100 }, { "epoch": 40.54512487570919, "grad_norm": 0.25402384996414185, "learning_rate": 0.001, "loss": 1.8316, "step": 693200 }, { "epoch": 40.55097385506229, "grad_norm": 0.1990257203578949, "learning_rate": 0.001, "loss": 1.8319, "step": 693300 }, { "epoch": 40.55682283441539, "grad_norm": 0.15549980103969574, "learning_rate": 0.001, "loss": 1.8322, "step": 693400 }, { "epoch": 40.562671813768496, "grad_norm": 0.18429064750671387, "learning_rate": 0.001, "loss": 1.8404, "step": 693500 }, { "epoch": 40.5685207931216, "grad_norm": 0.2139771729707718, "learning_rate": 0.001, "loss": 1.833, "step": 693600 }, { "epoch": 40.574369772474704, "grad_norm": 0.1797100305557251, "learning_rate": 0.001, "loss": 1.8338, "step": 693700 }, { "epoch": 40.58021875182781, "grad_norm": 0.14165234565734863, "learning_rate": 0.001, "loss": 1.8343, "step": 693800 }, { "epoch": 40.58606773118091, "grad_norm": 0.19076873362064362, "learning_rate": 0.001, "loss": 1.8359, "step": 693900 }, { "epoch": 40.591916710534015, "grad_norm": 0.17413224279880524, "learning_rate": 0.001, "loss": 1.8327, "step": 694000 }, { "epoch": 40.59776568988711, "grad_norm": 0.1865386813879013, "learning_rate": 0.001, "loss": 1.836, "step": 694100 }, { "epoch": 40.603614669240216, "grad_norm": 0.19785094261169434, "learning_rate": 0.001, "loss": 1.8356, "step": 694200 }, { "epoch": 40.60946364859332, "grad_norm": 0.2252686768770218, "learning_rate": 0.001, "loss": 1.8371, "step": 694300 }, { "epoch": 40.61531262794642, "grad_norm": 0.19821378588676453, "learning_rate": 0.001, "loss": 1.8371, "step": 694400 }, { "epoch": 40.62116160729953, "grad_norm": 0.18760311603546143, "learning_rate": 0.001, "loss": 1.8382, "step": 694500 }, { "epoch": 40.62701058665263, "grad_norm": 0.17590124905109406, "learning_rate": 0.001, "loss": 1.8333, "step": 694600 }, { "epoch": 40.632859566005735, "grad_norm": 0.19984795153141022, "learning_rate": 0.001, "loss": 1.8327, "step": 694700 }, { "epoch": 40.63870854535884, "grad_norm": 0.20432166755199432, "learning_rate": 0.001, "loss": 1.8359, "step": 694800 }, { "epoch": 40.644557524711935, "grad_norm": 0.17721131443977356, "learning_rate": 0.001, "loss": 1.8355, "step": 694900 }, { "epoch": 40.65040650406504, "grad_norm": 0.18811854720115662, "learning_rate": 0.001, "loss": 1.8346, "step": 695000 }, { "epoch": 40.65625548341814, "grad_norm": 0.18164265155792236, "learning_rate": 0.001, "loss": 1.8389, "step": 695100 }, { "epoch": 40.66210446277125, "grad_norm": 0.18667680025100708, "learning_rate": 0.001, "loss": 1.8368, "step": 695200 }, { "epoch": 40.66795344212435, "grad_norm": 0.18943244218826294, "learning_rate": 0.001, "loss": 1.8413, "step": 695300 }, { "epoch": 40.673802421477454, "grad_norm": 0.16101525723934174, "learning_rate": 0.001, "loss": 1.8372, "step": 695400 }, { "epoch": 40.67965140083056, "grad_norm": 0.26345324516296387, "learning_rate": 0.001, "loss": 1.8378, "step": 695500 }, { "epoch": 40.685500380183655, "grad_norm": 0.22105669975280762, "learning_rate": 0.001, "loss": 1.8323, "step": 695600 }, { "epoch": 40.69134935953676, "grad_norm": 0.21313561499118805, "learning_rate": 0.001, "loss": 1.8415, "step": 695700 }, { "epoch": 40.69719833888986, "grad_norm": 0.20183220505714417, "learning_rate": 0.001, "loss": 1.8323, "step": 695800 }, { "epoch": 40.703047318242966, "grad_norm": 0.1838710606098175, "learning_rate": 0.001, "loss": 1.8359, "step": 695900 }, { "epoch": 40.70889629759607, "grad_norm": 0.16736020147800446, "learning_rate": 0.001, "loss": 1.8381, "step": 696000 }, { "epoch": 40.714745276949174, "grad_norm": 0.17245711386203766, "learning_rate": 0.001, "loss": 1.8367, "step": 696100 }, { "epoch": 40.72059425630228, "grad_norm": 0.19313044846057892, "learning_rate": 0.001, "loss": 1.8353, "step": 696200 }, { "epoch": 40.72644323565538, "grad_norm": 0.17699359357357025, "learning_rate": 0.001, "loss": 1.842, "step": 696300 }, { "epoch": 40.73229221500848, "grad_norm": 0.24560357630252838, "learning_rate": 0.001, "loss": 1.8382, "step": 696400 }, { "epoch": 40.73814119436158, "grad_norm": 0.1854204386472702, "learning_rate": 0.001, "loss": 1.8318, "step": 696500 }, { "epoch": 40.743990173714685, "grad_norm": 0.17917710542678833, "learning_rate": 0.001, "loss": 1.8359, "step": 696600 }, { "epoch": 40.74983915306779, "grad_norm": 0.1674896478652954, "learning_rate": 0.001, "loss": 1.8298, "step": 696700 }, { "epoch": 40.75568813242089, "grad_norm": 0.19453588128089905, "learning_rate": 0.001, "loss": 1.8353, "step": 696800 }, { "epoch": 40.761537111774, "grad_norm": 0.1427377164363861, "learning_rate": 0.001, "loss": 1.8404, "step": 696900 }, { "epoch": 40.7673860911271, "grad_norm": 0.18482545018196106, "learning_rate": 0.001, "loss": 1.839, "step": 697000 }, { "epoch": 40.773235070480204, "grad_norm": 0.21349216997623444, "learning_rate": 0.001, "loss": 1.8314, "step": 697100 }, { "epoch": 40.7790840498333, "grad_norm": 0.16720063984394073, "learning_rate": 0.001, "loss": 1.8368, "step": 697200 }, { "epoch": 40.784933029186405, "grad_norm": 0.14613115787506104, "learning_rate": 0.001, "loss": 1.8316, "step": 697300 }, { "epoch": 40.79078200853951, "grad_norm": 0.16729721426963806, "learning_rate": 0.001, "loss": 1.8314, "step": 697400 }, { "epoch": 40.79663098789261, "grad_norm": 0.1537850797176361, "learning_rate": 0.001, "loss": 1.8376, "step": 697500 }, { "epoch": 40.802479967245716, "grad_norm": 0.1959483027458191, "learning_rate": 0.001, "loss": 1.8338, "step": 697600 }, { "epoch": 40.80832894659882, "grad_norm": 0.1732160598039627, "learning_rate": 0.001, "loss": 1.8419, "step": 697700 }, { "epoch": 40.814177925951924, "grad_norm": 0.2034706473350525, "learning_rate": 0.001, "loss": 1.836, "step": 697800 }, { "epoch": 40.82002690530503, "grad_norm": 0.19678938388824463, "learning_rate": 0.001, "loss": 1.8377, "step": 697900 }, { "epoch": 40.825875884658124, "grad_norm": 0.20608405768871307, "learning_rate": 0.001, "loss": 1.8401, "step": 698000 }, { "epoch": 40.83172486401123, "grad_norm": 0.17613276839256287, "learning_rate": 0.001, "loss": 1.8376, "step": 698100 }, { "epoch": 40.83757384336433, "grad_norm": 0.17487818002700806, "learning_rate": 0.001, "loss": 1.828, "step": 698200 }, { "epoch": 40.843422822717436, "grad_norm": 0.19540145993232727, "learning_rate": 0.001, "loss": 1.8311, "step": 698300 }, { "epoch": 40.84927180207054, "grad_norm": 0.2096787989139557, "learning_rate": 0.001, "loss": 1.8422, "step": 698400 }, { "epoch": 40.85512078142364, "grad_norm": 0.15921348333358765, "learning_rate": 0.001, "loss": 1.8396, "step": 698500 }, { "epoch": 40.86096976077675, "grad_norm": 0.16511382162570953, "learning_rate": 0.001, "loss": 1.8323, "step": 698600 }, { "epoch": 40.866818740129844, "grad_norm": 0.18165826797485352, "learning_rate": 0.001, "loss": 1.8347, "step": 698700 }, { "epoch": 40.87266771948295, "grad_norm": 0.1932382881641388, "learning_rate": 0.001, "loss": 1.8347, "step": 698800 }, { "epoch": 40.87851669883605, "grad_norm": 0.16885851323604584, "learning_rate": 0.001, "loss": 1.8369, "step": 698900 }, { "epoch": 40.884365678189155, "grad_norm": 0.17663273215293884, "learning_rate": 0.001, "loss": 1.8419, "step": 699000 }, { "epoch": 40.89021465754226, "grad_norm": 0.1930493861436844, "learning_rate": 0.001, "loss": 1.8368, "step": 699100 }, { "epoch": 40.89606363689536, "grad_norm": 0.17924368381500244, "learning_rate": 0.001, "loss": 1.8423, "step": 699200 }, { "epoch": 40.90191261624847, "grad_norm": 0.2062830626964569, "learning_rate": 0.001, "loss": 1.8373, "step": 699300 }, { "epoch": 40.90776159560157, "grad_norm": 0.17755061388015747, "learning_rate": 0.001, "loss": 1.8359, "step": 699400 }, { "epoch": 40.91361057495467, "grad_norm": 0.15658509731292725, "learning_rate": 0.001, "loss": 1.8345, "step": 699500 }, { "epoch": 40.91945955430777, "grad_norm": 0.20307952165603638, "learning_rate": 0.001, "loss": 1.8386, "step": 699600 }, { "epoch": 40.925308533660875, "grad_norm": 0.24043114483356476, "learning_rate": 0.001, "loss": 1.8413, "step": 699700 }, { "epoch": 40.93115751301398, "grad_norm": 0.18094471096992493, "learning_rate": 0.001, "loss": 1.8422, "step": 699800 }, { "epoch": 40.93700649236708, "grad_norm": 0.19180451333522797, "learning_rate": 0.001, "loss": 1.8345, "step": 699900 }, { "epoch": 40.942855471720186, "grad_norm": 0.1798315793275833, "learning_rate": 0.001, "loss": 1.839, "step": 700000 }, { "epoch": 40.942855471720186, "eval_ag_news_accuracy": 0.242359375, "eval_ag_news_bleu_score": 7.391733041947083, "eval_ag_news_bleu_score_sem": 0.5729905314357066, "eval_ag_news_emb_cos_sim": 0.7131178379058838, "eval_ag_news_emb_cos_sim_sem": 0.013214001432061195, "eval_ag_news_emb_top1_equal": 0.984375, "eval_ag_news_emb_top1_equal_sem": 0.011004959233105183, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6754493713378906, "eval_ag_news_n_ngrams_match_1": 14.3984375, "eval_ag_news_n_ngrams_match_2": 4.234375, "eval_ag_news_n_ngrams_match_3": 1.703125, "eval_ag_news_num_pred_words": 46.84375, "eval_ag_news_num_true_words": 46.203125, "eval_ag_news_perplexity": 14.518872739146552, "eval_ag_news_pred_num_tokens": 70.390625, "eval_ag_news_rouge_score": 0.29697457652196124, "eval_ag_news_runtime": 37.2503, "eval_ag_news_samples_per_second": 13.423, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3330492089126439, "eval_ag_news_token_set_f1_sem": 0.010067090196398933, "eval_ag_news_token_set_precision": 0.3160268447750329, "eval_ag_news_token_set_recall": 0.3597098813014255, "eval_ag_news_true_num_tokens": 63.7890625, "step": 700000 }, { "epoch": 40.942855471720186, "eval_anthropic_toxic_prompts_accuracy": 0.105359375, "eval_anthropic_toxic_prompts_bleu_score": 45.65058692096642, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.686834085008659, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9042806625366211, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008719297125935555, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1484375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0315484639796987, "eval_anthropic_toxic_prompts_loss": 1.263983130455017, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 10.21875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.3984375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.2734375, "eval_anthropic_toxic_prompts_num_pred_words": 16.109375, "eval_anthropic_toxic_prompts_num_true_words": 16.0, "eval_anthropic_toxic_prompts_perplexity": 3.5394917042107124, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.90625, "eval_anthropic_toxic_prompts_rouge_score": 0.6994327423760456, "eval_anthropic_toxic_prompts_runtime": 31.0889, "eval_anthropic_toxic_prompts_samples_per_second": 16.083, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.7242981704820028, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01725219460560456, "eval_anthropic_toxic_prompts_token_set_precision": 0.7206662281099964, "eval_anthropic_toxic_prompts_token_set_recall": 0.7322435553950049, "eval_anthropic_toxic_prompts_true_num_tokens": 19.4296875, "step": 700000 }, { "epoch": 40.942855471720186, "eval_arxiv_accuracy": 0.375125, "eval_arxiv_bleu_score": 1.8153135397252917, "eval_arxiv_bleu_score_sem": 0.17568437745532384, "eval_arxiv_emb_cos_sim": 0.4962964951992035, "eval_arxiv_emb_cos_sim_sem": 0.02049095369875431, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.430067300796509, "eval_arxiv_n_ngrams_match_1": 13.8828125, "eval_arxiv_n_ngrams_match_2": 2.3671875, "eval_arxiv_n_ngrams_match_3": 0.5, "eval_arxiv_num_pred_words": 56.4140625, "eval_arxiv_num_true_words": 86.7421875, "eval_arxiv_perplexity": 30.878720842255472, "eval_arxiv_pred_num_tokens": 125.4140625, "eval_arxiv_rouge_score": 0.18377873328876682, "eval_arxiv_runtime": 31.1948, "eval_arxiv_samples_per_second": 16.028, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.17827495397938853, "eval_arxiv_token_set_f1_sem": 0.008001564434302638, "eval_arxiv_token_set_precision": 0.12156909136182639, "eval_arxiv_token_set_recall": 0.4113106335684658, "eval_arxiv_true_num_tokens": 125.7109375, "step": 700000 }, { "epoch": 40.942855471720186, "eval_python_code_alpaca_accuracy": 0.12928125, "eval_python_code_alpaca_bleu_score": 29.50954873812661, "eval_python_code_alpaca_bleu_score_sem": 1.9159013709988273, "eval_python_code_alpaca_emb_cos_sim": 0.8809174299240112, "eval_python_code_alpaca_emb_cos_sim_sem": 0.007120899390429258, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4145230054855347, "eval_python_code_alpaca_n_ngrams_match_1": 10.46875, "eval_python_code_alpaca_n_ngrams_match_2": 5.8203125, "eval_python_code_alpaca_n_ngrams_match_3": 3.375, "eval_python_code_alpaca_num_pred_words": 17.734375, "eval_python_code_alpaca_num_true_words": 18.8125, "eval_python_code_alpaca_perplexity": 4.114523392734923, "eval_python_code_alpaca_pred_num_tokens": 24.7265625, "eval_python_code_alpaca_rouge_score": 0.6036051862580523, "eval_python_code_alpaca_runtime": 30.451, "eval_python_code_alpaca_samples_per_second": 16.42, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6217296637479852, "eval_python_code_alpaca_token_set_f1_sem": 0.014395975381979268, "eval_python_code_alpaca_token_set_precision": 0.6098586219837593, "eval_python_code_alpaca_token_set_recall": 0.6413646664213808, "eval_python_code_alpaca_true_num_tokens": 24.6484375, "step": 700000 }, { "epoch": 40.942855471720186, "eval_wikibio_accuracy": 0.368015625, "eval_wikibio_bleu_score": 7.523010827599203, "eval_wikibio_bleu_score_sem": 0.691851377107065, "eval_wikibio_emb_cos_sim": 0.6069284081459045, "eval_wikibio_emb_cos_sim_sem": 0.02264026552438736, "eval_wikibio_emb_top1_equal": 0.90625, "eval_wikibio_emb_top1_equal_sem": 0.025864720344543457, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.68471622467041, "eval_wikibio_n_ngrams_match_1": 15.7109375, "eval_wikibio_n_ngrams_match_2": 5.3671875, "eval_wikibio_n_ngrams_match_3": 2.1875, "eval_wikibio_num_pred_words": 54.9921875, "eval_wikibio_num_true_words": 51.890625, "eval_wikibio_perplexity": 14.654042334480028, "eval_wikibio_pred_num_tokens": 106.5390625, "eval_wikibio_rouge_score": 0.31077070350890723, "eval_wikibio_runtime": 31.0218, "eval_wikibio_samples_per_second": 16.118, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32439426791526776, "eval_wikibio_token_set_f1_sem": 0.012171009048215236, "eval_wikibio_token_set_precision": 0.2922064393322459, "eval_wikibio_token_set_recall": 0.409338530705313, "eval_wikibio_true_num_tokens": 102.015625, "step": 700000 }, { "epoch": 40.942855471720186, "eval_msmarco_accuracy": 0.4020625, "eval_msmarco_bleu_score": 18.407474444691537, "eval_msmarco_bleu_score_sem": 1.566219289749529, "eval_msmarco_emb_cos_sim": 0.8133162260055542, "eval_msmarco_emb_cos_sim_sem": 0.015435080975294113, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7021925449371338, "eval_msmarco_n_ngrams_match_1": 28.84375, "eval_msmarco_n_ngrams_match_2": 13.6171875, "eval_msmarco_n_ngrams_match_3": 8.0234375, "eval_msmarco_num_pred_words": 59.6015625, "eval_msmarco_num_true_words": 61.9765625, "eval_msmarco_perplexity": 5.485962434314185, "eval_msmarco_pred_num_tokens": 80.0234375, "eval_msmarco_rouge_score": 0.4592865342508042, "eval_msmarco_runtime": 25.6274, "eval_msmarco_samples_per_second": 19.51, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.48937523359378743, "eval_msmarco_token_set_f1_sem": 0.014022988139218283, "eval_msmarco_token_set_precision": 0.45428091875932747, "eval_msmarco_token_set_recall": 0.5472663998428255, "eval_msmarco_true_num_tokens": 79.1953125, "step": 700000 }, { "epoch": 40.94870445107329, "grad_norm": 0.22204335033893585, "learning_rate": 0.001, "loss": 1.8416, "step": 700100 }, { "epoch": 40.954553430426394, "grad_norm": 0.15303783118724823, "learning_rate": 0.001, "loss": 1.8342, "step": 700200 }, { "epoch": 40.96040240977949, "grad_norm": 0.21111878752708435, "learning_rate": 0.001, "loss": 1.836, "step": 700300 }, { "epoch": 40.966251389132594, "grad_norm": 0.21408611536026, "learning_rate": 0.001, "loss": 1.8353, "step": 700400 }, { "epoch": 40.9721003684857, "grad_norm": 0.1918196827173233, "learning_rate": 0.001, "loss": 1.8382, "step": 700500 }, { "epoch": 40.9779493478388, "grad_norm": 0.1777401566505432, "learning_rate": 0.001, "loss": 1.8367, "step": 700600 }, { "epoch": 40.983798327191906, "grad_norm": 0.17319412529468536, "learning_rate": 0.001, "loss": 1.8428, "step": 700700 }, { "epoch": 40.98964730654501, "grad_norm": 0.19117212295532227, "learning_rate": 0.001, "loss": 1.8388, "step": 700800 }, { "epoch": 40.99549628589811, "grad_norm": 0.2071218341588974, "learning_rate": 0.001, "loss": 1.8401, "step": 700900 }, { "epoch": 41.00134526525122, "grad_norm": 0.18362802267074585, "learning_rate": 0.001, "loss": 1.834, "step": 701000 }, { "epoch": 41.007194244604314, "grad_norm": 0.1837093085050583, "learning_rate": 0.001, "loss": 1.8155, "step": 701100 }, { "epoch": 41.01304322395742, "grad_norm": 0.21181438863277435, "learning_rate": 0.001, "loss": 1.8216, "step": 701200 }, { "epoch": 41.01889220331052, "grad_norm": 0.3038765490055084, "learning_rate": 0.001, "loss": 1.8216, "step": 701300 }, { "epoch": 41.024741182663625, "grad_norm": 0.15442955493927002, "learning_rate": 0.001, "loss": 1.8191, "step": 701400 }, { "epoch": 41.03059016201673, "grad_norm": 0.26664137840270996, "learning_rate": 0.001, "loss": 1.8273, "step": 701500 }, { "epoch": 41.03643914136983, "grad_norm": 0.21343888342380524, "learning_rate": 0.001, "loss": 1.8261, "step": 701600 }, { "epoch": 41.04228812072294, "grad_norm": 0.21725131571292877, "learning_rate": 0.001, "loss": 1.8234, "step": 701700 }, { "epoch": 41.04813710007603, "grad_norm": 0.1943342536687851, "learning_rate": 0.001, "loss": 1.8264, "step": 701800 }, { "epoch": 41.05398607942914, "grad_norm": 0.19890096783638, "learning_rate": 0.001, "loss": 1.8228, "step": 701900 }, { "epoch": 41.05983505878224, "grad_norm": 0.23947350680828094, "learning_rate": 0.001, "loss": 1.8269, "step": 702000 }, { "epoch": 41.065684038135345, "grad_norm": 0.19909733533859253, "learning_rate": 0.001, "loss": 1.8248, "step": 702100 }, { "epoch": 41.07153301748845, "grad_norm": 0.20522506535053253, "learning_rate": 0.001, "loss": 1.8217, "step": 702200 }, { "epoch": 41.07738199684155, "grad_norm": 0.19599364697933197, "learning_rate": 0.001, "loss": 1.828, "step": 702300 }, { "epoch": 41.083230976194656, "grad_norm": 0.17372699081897736, "learning_rate": 0.001, "loss": 1.8277, "step": 702400 }, { "epoch": 41.08907995554776, "grad_norm": 0.21449314057826996, "learning_rate": 0.001, "loss": 1.8245, "step": 702500 }, { "epoch": 41.09492893490086, "grad_norm": 0.17043152451515198, "learning_rate": 0.001, "loss": 1.8204, "step": 702600 }, { "epoch": 41.10077791425396, "grad_norm": 0.2185995727777481, "learning_rate": 0.001, "loss": 1.8277, "step": 702700 }, { "epoch": 41.106626893607064, "grad_norm": 0.23992528021335602, "learning_rate": 0.001, "loss": 1.8273, "step": 702800 }, { "epoch": 41.11247587296017, "grad_norm": 0.18035796284675598, "learning_rate": 0.001, "loss": 1.8244, "step": 702900 }, { "epoch": 41.11832485231327, "grad_norm": 0.1911080777645111, "learning_rate": 0.001, "loss": 1.8273, "step": 703000 }, { "epoch": 41.124173831666376, "grad_norm": 0.1921652853488922, "learning_rate": 0.001, "loss": 1.8326, "step": 703100 }, { "epoch": 41.13002281101948, "grad_norm": 0.18209266662597656, "learning_rate": 0.001, "loss": 1.8217, "step": 703200 }, { "epoch": 41.13587179037258, "grad_norm": 0.21620996296405792, "learning_rate": 0.001, "loss": 1.8272, "step": 703300 }, { "epoch": 41.14172076972568, "grad_norm": 0.23774610459804535, "learning_rate": 0.001, "loss": 1.8261, "step": 703400 }, { "epoch": 41.147569749078784, "grad_norm": 0.1858484447002411, "learning_rate": 0.001, "loss": 1.8244, "step": 703500 }, { "epoch": 41.15341872843189, "grad_norm": 0.30093827843666077, "learning_rate": 0.001, "loss": 1.8324, "step": 703600 }, { "epoch": 41.15926770778499, "grad_norm": 0.20305435359477997, "learning_rate": 0.001, "loss": 1.8261, "step": 703700 }, { "epoch": 41.165116687138095, "grad_norm": 0.22853811085224152, "learning_rate": 0.001, "loss": 1.8282, "step": 703800 }, { "epoch": 41.1709656664912, "grad_norm": 0.1906748265028, "learning_rate": 0.001, "loss": 1.8275, "step": 703900 }, { "epoch": 41.1768146458443, "grad_norm": 0.18936164677143097, "learning_rate": 0.001, "loss": 1.8235, "step": 704000 }, { "epoch": 41.18266362519741, "grad_norm": 0.21857373416423798, "learning_rate": 0.001, "loss": 1.8318, "step": 704100 }, { "epoch": 41.1885126045505, "grad_norm": 0.19503973424434662, "learning_rate": 0.001, "loss": 1.8247, "step": 704200 }, { "epoch": 41.19436158390361, "grad_norm": 0.2293807715177536, "learning_rate": 0.001, "loss": 1.8287, "step": 704300 }, { "epoch": 41.20021056325671, "grad_norm": 0.32236045598983765, "learning_rate": 0.001, "loss": 1.8299, "step": 704400 }, { "epoch": 41.206059542609815, "grad_norm": 0.23563525080680847, "learning_rate": 0.001, "loss": 1.827, "step": 704500 }, { "epoch": 41.21190852196292, "grad_norm": 0.18175266683101654, "learning_rate": 0.001, "loss": 1.8224, "step": 704600 }, { "epoch": 41.21775750131602, "grad_norm": 0.20157381892204285, "learning_rate": 0.001, "loss": 1.8253, "step": 704700 }, { "epoch": 41.223606480669126, "grad_norm": 0.24299399554729462, "learning_rate": 0.001, "loss": 1.8288, "step": 704800 }, { "epoch": 41.22945546002222, "grad_norm": 0.23300553858280182, "learning_rate": 0.001, "loss": 1.8243, "step": 704900 }, { "epoch": 41.23530443937533, "grad_norm": 0.16607794165611267, "learning_rate": 0.001, "loss": 1.826, "step": 705000 }, { "epoch": 41.24115341872843, "grad_norm": 0.21724212169647217, "learning_rate": 0.001, "loss": 1.8335, "step": 705100 }, { "epoch": 41.247002398081534, "grad_norm": 0.24081067740917206, "learning_rate": 0.001, "loss": 1.8342, "step": 705200 }, { "epoch": 41.25285137743464, "grad_norm": 0.20845748484134674, "learning_rate": 0.001, "loss": 1.8301, "step": 705300 }, { "epoch": 41.25870035678774, "grad_norm": 0.19641976058483124, "learning_rate": 0.001, "loss": 1.8291, "step": 705400 }, { "epoch": 41.264549336140846, "grad_norm": 0.1718287616968155, "learning_rate": 0.001, "loss": 1.8259, "step": 705500 }, { "epoch": 41.27039831549395, "grad_norm": 0.20520564913749695, "learning_rate": 0.001, "loss": 1.8326, "step": 705600 }, { "epoch": 41.276247294847046, "grad_norm": 0.18291999399662018, "learning_rate": 0.001, "loss": 1.8281, "step": 705700 }, { "epoch": 41.28209627420015, "grad_norm": 0.17646124958992004, "learning_rate": 0.001, "loss": 1.8275, "step": 705800 }, { "epoch": 41.287945253553254, "grad_norm": 0.19588637351989746, "learning_rate": 0.001, "loss": 1.8294, "step": 705900 }, { "epoch": 41.29379423290636, "grad_norm": 0.2324225902557373, "learning_rate": 0.001, "loss": 1.8339, "step": 706000 }, { "epoch": 41.29964321225946, "grad_norm": 0.1786058098077774, "learning_rate": 0.001, "loss": 1.8266, "step": 706100 }, { "epoch": 41.305492191612565, "grad_norm": 0.19847112894058228, "learning_rate": 0.001, "loss": 1.83, "step": 706200 }, { "epoch": 41.31134117096567, "grad_norm": 0.21962065994739532, "learning_rate": 0.001, "loss": 1.836, "step": 706300 }, { "epoch": 41.31719015031877, "grad_norm": 0.15756329894065857, "learning_rate": 0.001, "loss": 1.8244, "step": 706400 }, { "epoch": 41.32303912967187, "grad_norm": 0.17355576157569885, "learning_rate": 0.001, "loss": 1.8229, "step": 706500 }, { "epoch": 41.32888810902497, "grad_norm": 0.196270152926445, "learning_rate": 0.001, "loss": 1.8281, "step": 706600 }, { "epoch": 41.33473708837808, "grad_norm": 0.20977169275283813, "learning_rate": 0.001, "loss": 1.8254, "step": 706700 }, { "epoch": 41.34058606773118, "grad_norm": 0.2123986929655075, "learning_rate": 0.001, "loss": 1.8244, "step": 706800 }, { "epoch": 41.346435047084285, "grad_norm": 0.2696133852005005, "learning_rate": 0.001, "loss": 1.8324, "step": 706900 }, { "epoch": 41.35228402643739, "grad_norm": 0.20210789144039154, "learning_rate": 0.001, "loss": 1.8321, "step": 707000 }, { "epoch": 41.35813300579049, "grad_norm": 0.1852167546749115, "learning_rate": 0.001, "loss": 1.8273, "step": 707100 }, { "epoch": 41.363981985143596, "grad_norm": 0.2054567188024521, "learning_rate": 0.001, "loss": 1.8273, "step": 707200 }, { "epoch": 41.36983096449669, "grad_norm": 0.1639585793018341, "learning_rate": 0.001, "loss": 1.8312, "step": 707300 }, { "epoch": 41.375679943849796, "grad_norm": 0.23786939680576324, "learning_rate": 0.001, "loss": 1.8326, "step": 707400 }, { "epoch": 41.3815289232029, "grad_norm": 0.1815141886472702, "learning_rate": 0.001, "loss": 1.8351, "step": 707500 }, { "epoch": 41.387377902556004, "grad_norm": 0.1831924468278885, "learning_rate": 0.001, "loss": 1.8314, "step": 707600 }, { "epoch": 41.39322688190911, "grad_norm": 0.29087525606155396, "learning_rate": 0.001, "loss": 1.8323, "step": 707700 }, { "epoch": 41.39907586126221, "grad_norm": 0.1905878335237503, "learning_rate": 0.001, "loss": 1.8295, "step": 707800 }, { "epoch": 41.404924840615315, "grad_norm": 0.19952106475830078, "learning_rate": 0.001, "loss": 1.832, "step": 707900 }, { "epoch": 41.41077381996841, "grad_norm": 0.21169550716876984, "learning_rate": 0.001, "loss": 1.8319, "step": 708000 }, { "epoch": 41.416622799321516, "grad_norm": 0.23918816447257996, "learning_rate": 0.001, "loss": 1.8308, "step": 708100 }, { "epoch": 41.42247177867462, "grad_norm": 0.1984093189239502, "learning_rate": 0.001, "loss": 1.8307, "step": 708200 }, { "epoch": 41.42832075802772, "grad_norm": 0.2389850914478302, "learning_rate": 0.001, "loss": 1.8308, "step": 708300 }, { "epoch": 41.43416973738083, "grad_norm": 0.24182792007923126, "learning_rate": 0.001, "loss": 1.8305, "step": 708400 }, { "epoch": 41.44001871673393, "grad_norm": 0.17416994273662567, "learning_rate": 0.001, "loss": 1.8275, "step": 708500 }, { "epoch": 41.445867696087035, "grad_norm": 0.15825921297073364, "learning_rate": 0.001, "loss": 1.8263, "step": 708600 }, { "epoch": 41.45171667544014, "grad_norm": 0.1880500167608261, "learning_rate": 0.001, "loss": 1.8327, "step": 708700 }, { "epoch": 41.457565654793235, "grad_norm": 0.21482619643211365, "learning_rate": 0.001, "loss": 1.8355, "step": 708800 }, { "epoch": 41.46341463414634, "grad_norm": 0.20005881786346436, "learning_rate": 0.001, "loss": 1.8243, "step": 708900 }, { "epoch": 41.46926361349944, "grad_norm": 0.20116384327411652, "learning_rate": 0.001, "loss": 1.8232, "step": 709000 }, { "epoch": 41.47511259285255, "grad_norm": 0.19081804156303406, "learning_rate": 0.001, "loss": 1.83, "step": 709100 }, { "epoch": 41.48096157220565, "grad_norm": 0.24329470098018646, "learning_rate": 0.001, "loss": 1.8303, "step": 709200 }, { "epoch": 41.486810551558754, "grad_norm": 0.20098786056041718, "learning_rate": 0.001, "loss": 1.8269, "step": 709300 }, { "epoch": 41.49265953091186, "grad_norm": 0.2193414270877838, "learning_rate": 0.001, "loss": 1.8294, "step": 709400 }, { "epoch": 41.49850851026496, "grad_norm": 0.1877298504114151, "learning_rate": 0.001, "loss": 1.8311, "step": 709500 }, { "epoch": 41.50435748961806, "grad_norm": 0.2531660795211792, "learning_rate": 0.001, "loss": 1.8316, "step": 709600 }, { "epoch": 41.51020646897116, "grad_norm": 0.2777736485004425, "learning_rate": 0.001, "loss": 1.8284, "step": 709700 }, { "epoch": 41.516055448324266, "grad_norm": 0.18066541850566864, "learning_rate": 0.001, "loss": 1.8338, "step": 709800 }, { "epoch": 41.52190442767737, "grad_norm": 0.20299038290977478, "learning_rate": 0.001, "loss": 1.8311, "step": 709900 }, { "epoch": 41.527753407030474, "grad_norm": 0.1800748109817505, "learning_rate": 0.001, "loss": 1.8275, "step": 710000 }, { "epoch": 41.53360238638358, "grad_norm": 0.22408519685268402, "learning_rate": 0.001, "loss": 1.8294, "step": 710100 }, { "epoch": 41.53945136573668, "grad_norm": 0.22342653572559357, "learning_rate": 0.001, "loss": 1.8314, "step": 710200 }, { "epoch": 41.545300345089785, "grad_norm": 0.18949361145496368, "learning_rate": 0.001, "loss": 1.8293, "step": 710300 }, { "epoch": 41.55114932444288, "grad_norm": 0.25933417677879333, "learning_rate": 0.001, "loss": 1.8298, "step": 710400 }, { "epoch": 41.556998303795986, "grad_norm": 0.17837032675743103, "learning_rate": 0.001, "loss": 1.8336, "step": 710500 }, { "epoch": 41.56284728314909, "grad_norm": 0.21155162155628204, "learning_rate": 0.001, "loss": 1.8267, "step": 710600 }, { "epoch": 41.56869626250219, "grad_norm": 0.1866372674703598, "learning_rate": 0.001, "loss": 1.8347, "step": 710700 }, { "epoch": 41.5745452418553, "grad_norm": 0.255376935005188, "learning_rate": 0.001, "loss": 1.8324, "step": 710800 }, { "epoch": 41.5803942212084, "grad_norm": 0.23952074348926544, "learning_rate": 0.001, "loss": 1.8351, "step": 710900 }, { "epoch": 41.586243200561505, "grad_norm": 0.1838168501853943, "learning_rate": 0.001, "loss": 1.8321, "step": 711000 }, { "epoch": 41.5920921799146, "grad_norm": 0.19243720173835754, "learning_rate": 0.001, "loss": 1.83, "step": 711100 }, { "epoch": 41.597941159267705, "grad_norm": 0.21830306947231293, "learning_rate": 0.001, "loss": 1.8329, "step": 711200 }, { "epoch": 41.60379013862081, "grad_norm": 0.2085135281085968, "learning_rate": 0.001, "loss": 1.8279, "step": 711300 }, { "epoch": 41.60963911797391, "grad_norm": 0.19994840025901794, "learning_rate": 0.001, "loss": 1.8323, "step": 711400 }, { "epoch": 41.61548809732702, "grad_norm": 0.2939547300338745, "learning_rate": 0.001, "loss": 1.8305, "step": 711500 }, { "epoch": 41.62133707668012, "grad_norm": 0.2552366256713867, "learning_rate": 0.001, "loss": 1.8317, "step": 711600 }, { "epoch": 41.627186056033224, "grad_norm": 0.2991199791431427, "learning_rate": 0.001, "loss": 1.8366, "step": 711700 }, { "epoch": 41.63303503538633, "grad_norm": 0.15687468647956848, "learning_rate": 0.001, "loss": 1.8379, "step": 711800 }, { "epoch": 41.638884014739425, "grad_norm": 0.20649172365665436, "learning_rate": 0.001, "loss": 1.8297, "step": 711900 }, { "epoch": 41.64473299409253, "grad_norm": 0.15622453391551971, "learning_rate": 0.001, "loss": 1.8375, "step": 712000 }, { "epoch": 41.65058197344563, "grad_norm": 0.18547214567661285, "learning_rate": 0.001, "loss": 1.8297, "step": 712100 }, { "epoch": 41.656430952798736, "grad_norm": 0.21544457972049713, "learning_rate": 0.001, "loss": 1.8366, "step": 712200 }, { "epoch": 41.66227993215184, "grad_norm": 0.19402997195720673, "learning_rate": 0.001, "loss": 1.8248, "step": 712300 }, { "epoch": 41.668128911504944, "grad_norm": 0.23835311830043793, "learning_rate": 0.001, "loss": 1.8372, "step": 712400 }, { "epoch": 41.67397789085805, "grad_norm": 0.17297236621379852, "learning_rate": 0.001, "loss": 1.8318, "step": 712500 }, { "epoch": 41.67982687021115, "grad_norm": 0.21860842406749725, "learning_rate": 0.001, "loss": 1.834, "step": 712600 }, { "epoch": 41.68567584956425, "grad_norm": 0.18761128187179565, "learning_rate": 0.001, "loss": 1.8305, "step": 712700 }, { "epoch": 41.69152482891735, "grad_norm": 0.18695862591266632, "learning_rate": 0.001, "loss": 1.8322, "step": 712800 }, { "epoch": 41.697373808270456, "grad_norm": 0.18300293385982513, "learning_rate": 0.001, "loss": 1.829, "step": 712900 }, { "epoch": 41.70322278762356, "grad_norm": 0.2412615716457367, "learning_rate": 0.001, "loss": 1.8324, "step": 713000 }, { "epoch": 41.70907176697666, "grad_norm": 0.2287134975194931, "learning_rate": 0.001, "loss": 1.8287, "step": 713100 }, { "epoch": 41.71492074632977, "grad_norm": 0.2170328050851822, "learning_rate": 0.001, "loss": 1.8293, "step": 713200 }, { "epoch": 41.72076972568287, "grad_norm": 0.18908987939357758, "learning_rate": 0.001, "loss": 1.8299, "step": 713300 }, { "epoch": 41.726618705035975, "grad_norm": 0.22458107769489288, "learning_rate": 0.001, "loss": 1.8298, "step": 713400 }, { "epoch": 41.73246768438907, "grad_norm": 0.27685263752937317, "learning_rate": 0.001, "loss": 1.8445, "step": 713500 }, { "epoch": 41.738316663742175, "grad_norm": 0.198569193482399, "learning_rate": 0.001, "loss": 1.8359, "step": 713600 }, { "epoch": 41.74416564309528, "grad_norm": 0.22503507137298584, "learning_rate": 0.001, "loss": 1.8311, "step": 713700 }, { "epoch": 41.75001462244838, "grad_norm": 0.21648748219013214, "learning_rate": 0.001, "loss": 1.8311, "step": 713800 }, { "epoch": 41.75586360180149, "grad_norm": 0.20489895343780518, "learning_rate": 0.001, "loss": 1.8328, "step": 713900 }, { "epoch": 41.76171258115459, "grad_norm": 0.16908879578113556, "learning_rate": 0.001, "loss": 1.834, "step": 714000 }, { "epoch": 41.767561560507694, "grad_norm": 0.24625787138938904, "learning_rate": 0.001, "loss": 1.8261, "step": 714100 }, { "epoch": 41.77341053986079, "grad_norm": 0.16065490245819092, "learning_rate": 0.001, "loss": 1.8298, "step": 714200 }, { "epoch": 41.779259519213895, "grad_norm": 0.2665110230445862, "learning_rate": 0.001, "loss": 1.8355, "step": 714300 }, { "epoch": 41.785108498567, "grad_norm": 0.21977536380290985, "learning_rate": 0.001, "loss": 1.8357, "step": 714400 }, { "epoch": 41.7909574779201, "grad_norm": 0.25019946694374084, "learning_rate": 0.001, "loss": 1.8319, "step": 714500 }, { "epoch": 41.796806457273206, "grad_norm": 0.2240883857011795, "learning_rate": 0.001, "loss": 1.83, "step": 714600 }, { "epoch": 41.80265543662631, "grad_norm": 0.23982028663158417, "learning_rate": 0.001, "loss": 1.8367, "step": 714700 }, { "epoch": 41.808504415979414, "grad_norm": 0.21595729887485504, "learning_rate": 0.001, "loss": 1.8337, "step": 714800 }, { "epoch": 41.81435339533252, "grad_norm": 0.17666158080101013, "learning_rate": 0.001, "loss": 1.8363, "step": 714900 }, { "epoch": 41.820202374685614, "grad_norm": 0.18924178183078766, "learning_rate": 0.001, "loss": 1.8347, "step": 715000 }, { "epoch": 41.82605135403872, "grad_norm": 0.18658983707427979, "learning_rate": 0.001, "loss": 1.8291, "step": 715100 }, { "epoch": 41.83190033339182, "grad_norm": 0.19003449380397797, "learning_rate": 0.001, "loss": 1.8312, "step": 715200 }, { "epoch": 41.837749312744926, "grad_norm": 0.1677178144454956, "learning_rate": 0.001, "loss": 1.8327, "step": 715300 }, { "epoch": 41.84359829209803, "grad_norm": 0.17605601251125336, "learning_rate": 0.001, "loss": 1.841, "step": 715400 }, { "epoch": 41.84944727145113, "grad_norm": 0.16717509925365448, "learning_rate": 0.001, "loss": 1.8337, "step": 715500 }, { "epoch": 41.85529625080424, "grad_norm": 0.26267915964126587, "learning_rate": 0.001, "loss": 1.8347, "step": 715600 }, { "epoch": 41.86114523015734, "grad_norm": 0.20621994137763977, "learning_rate": 0.001, "loss": 1.8343, "step": 715700 }, { "epoch": 41.86699420951044, "grad_norm": 0.3353482484817505, "learning_rate": 0.001, "loss": 1.8303, "step": 715800 }, { "epoch": 41.87284318886354, "grad_norm": 0.19913844764232635, "learning_rate": 0.001, "loss": 1.8303, "step": 715900 }, { "epoch": 41.878692168216645, "grad_norm": 0.1786254346370697, "learning_rate": 0.001, "loss": 1.8374, "step": 716000 }, { "epoch": 41.88454114756975, "grad_norm": 0.26217353343963623, "learning_rate": 0.001, "loss": 1.8323, "step": 716100 }, { "epoch": 41.89039012692285, "grad_norm": 0.2295835316181183, "learning_rate": 0.001, "loss": 1.8357, "step": 716200 }, { "epoch": 41.89623910627596, "grad_norm": 0.2201710343360901, "learning_rate": 0.001, "loss": 1.8367, "step": 716300 }, { "epoch": 41.90208808562906, "grad_norm": 0.20379701256752014, "learning_rate": 0.001, "loss": 1.8391, "step": 716400 }, { "epoch": 41.907937064982164, "grad_norm": 0.22888723015785217, "learning_rate": 0.001, "loss": 1.8354, "step": 716500 }, { "epoch": 41.91378604433526, "grad_norm": 0.22020356357097626, "learning_rate": 0.001, "loss": 1.8344, "step": 716600 }, { "epoch": 41.919635023688365, "grad_norm": 0.20295147597789764, "learning_rate": 0.001, "loss": 1.8337, "step": 716700 }, { "epoch": 41.92548400304147, "grad_norm": 0.1644425243139267, "learning_rate": 0.001, "loss": 1.8325, "step": 716800 }, { "epoch": 41.93133298239457, "grad_norm": 0.31579601764678955, "learning_rate": 0.001, "loss": 1.829, "step": 716900 }, { "epoch": 41.937181961747676, "grad_norm": 0.31125718355178833, "learning_rate": 0.001, "loss": 1.8377, "step": 717000 }, { "epoch": 41.94303094110078, "grad_norm": 0.21462740004062653, "learning_rate": 0.001, "loss": 1.8327, "step": 717100 }, { "epoch": 41.948879920453884, "grad_norm": 0.23650296032428741, "learning_rate": 0.001, "loss": 1.8345, "step": 717200 }, { "epoch": 41.95472889980698, "grad_norm": 0.24992652237415314, "learning_rate": 0.001, "loss": 1.8386, "step": 717300 }, { "epoch": 41.960577879160084, "grad_norm": 0.24181680381298065, "learning_rate": 0.001, "loss": 1.8318, "step": 717400 }, { "epoch": 41.96642685851319, "grad_norm": 0.16668300330638885, "learning_rate": 0.001, "loss": 1.832, "step": 717500 }, { "epoch": 41.97227583786629, "grad_norm": 0.18997235596179962, "learning_rate": 0.001, "loss": 1.8249, "step": 717600 }, { "epoch": 41.978124817219395, "grad_norm": 0.18742063641548157, "learning_rate": 0.001, "loss": 1.8356, "step": 717700 }, { "epoch": 41.9839737965725, "grad_norm": 0.23729820549488068, "learning_rate": 0.001, "loss": 1.8375, "step": 717800 }, { "epoch": 41.9898227759256, "grad_norm": 0.20779241621494293, "learning_rate": 0.001, "loss": 1.8271, "step": 717900 }, { "epoch": 41.99567175527871, "grad_norm": 0.218163400888443, "learning_rate": 0.001, "loss": 1.8331, "step": 718000 }, { "epoch": 42.001520734631804, "grad_norm": 0.2102748602628708, "learning_rate": 0.001, "loss": 1.8296, "step": 718100 }, { "epoch": 42.00736971398491, "grad_norm": 0.2960781455039978, "learning_rate": 0.001, "loss": 1.819, "step": 718200 }, { "epoch": 42.01321869333801, "grad_norm": 0.256256103515625, "learning_rate": 0.001, "loss": 1.8247, "step": 718300 }, { "epoch": 42.019067672691115, "grad_norm": 0.1844290941953659, "learning_rate": 0.001, "loss": 1.821, "step": 718400 }, { "epoch": 42.02491665204422, "grad_norm": 0.22593532502651215, "learning_rate": 0.001, "loss": 1.8173, "step": 718500 }, { "epoch": 42.03076563139732, "grad_norm": 0.21930383145809174, "learning_rate": 0.001, "loss": 1.8178, "step": 718600 }, { "epoch": 42.036614610750426, "grad_norm": 0.3606012165546417, "learning_rate": 0.001, "loss": 1.8232, "step": 718700 }, { "epoch": 42.04246359010353, "grad_norm": 0.1699526309967041, "learning_rate": 0.001, "loss": 1.82, "step": 718800 }, { "epoch": 42.04831256945663, "grad_norm": 0.22838394343852997, "learning_rate": 0.001, "loss": 1.8185, "step": 718900 }, { "epoch": 42.05416154880973, "grad_norm": 0.16841228306293488, "learning_rate": 0.001, "loss": 1.8223, "step": 719000 }, { "epoch": 42.060010528162834, "grad_norm": 0.24783720076084137, "learning_rate": 0.001, "loss": 1.8254, "step": 719100 }, { "epoch": 42.06585950751594, "grad_norm": 0.16354091465473175, "learning_rate": 0.001, "loss": 1.8241, "step": 719200 }, { "epoch": 42.07170848686904, "grad_norm": 0.19328220188617706, "learning_rate": 0.001, "loss": 1.8201, "step": 719300 }, { "epoch": 42.077557466222146, "grad_norm": 0.13841843605041504, "learning_rate": 0.001, "loss": 1.8153, "step": 719400 }, { "epoch": 42.08340644557525, "grad_norm": 0.18523263931274414, "learning_rate": 0.001, "loss": 1.8225, "step": 719500 }, { "epoch": 42.08925542492835, "grad_norm": 0.29188022017478943, "learning_rate": 0.001, "loss": 1.82, "step": 719600 }, { "epoch": 42.09510440428145, "grad_norm": 0.17516712844371796, "learning_rate": 0.001, "loss": 1.8217, "step": 719700 }, { "epoch": 42.100953383634554, "grad_norm": 0.19156306982040405, "learning_rate": 0.001, "loss": 1.8205, "step": 719800 }, { "epoch": 42.10680236298766, "grad_norm": 0.18349295854568481, "learning_rate": 0.001, "loss": 1.817, "step": 719900 }, { "epoch": 42.11265134234076, "grad_norm": 0.19126123189926147, "learning_rate": 0.001, "loss": 1.8313, "step": 720000 }, { "epoch": 42.118500321693865, "grad_norm": 0.16840793192386627, "learning_rate": 0.001, "loss": 1.8294, "step": 720100 }, { "epoch": 42.12434930104697, "grad_norm": 0.16830004751682281, "learning_rate": 0.001, "loss": 1.8222, "step": 720200 }, { "epoch": 42.13019828040007, "grad_norm": 0.18372514843940735, "learning_rate": 0.001, "loss": 1.8282, "step": 720300 }, { "epoch": 42.13604725975317, "grad_norm": 0.28750333189964294, "learning_rate": 0.001, "loss": 1.8253, "step": 720400 }, { "epoch": 42.14189623910627, "grad_norm": 0.21324971318244934, "learning_rate": 0.001, "loss": 1.8306, "step": 720500 }, { "epoch": 42.14774521845938, "grad_norm": 0.19182927906513214, "learning_rate": 0.001, "loss": 1.8277, "step": 720600 }, { "epoch": 42.15359419781248, "grad_norm": 0.14611297845840454, "learning_rate": 0.001, "loss": 1.8203, "step": 720700 }, { "epoch": 42.159443177165585, "grad_norm": 0.21016262471675873, "learning_rate": 0.001, "loss": 1.8286, "step": 720800 }, { "epoch": 42.16529215651869, "grad_norm": 0.21911852061748505, "learning_rate": 0.001, "loss": 1.8275, "step": 720900 }, { "epoch": 42.17114113587179, "grad_norm": 0.19834068417549133, "learning_rate": 0.001, "loss": 1.8217, "step": 721000 }, { "epoch": 42.176990115224896, "grad_norm": 0.3016441762447357, "learning_rate": 0.001, "loss": 1.8245, "step": 721100 }, { "epoch": 42.18283909457799, "grad_norm": 0.2057637870311737, "learning_rate": 0.001, "loss": 1.829, "step": 721200 }, { "epoch": 42.1886880739311, "grad_norm": 0.24227848649024963, "learning_rate": 0.001, "loss": 1.8182, "step": 721300 }, { "epoch": 42.1945370532842, "grad_norm": 0.16927039623260498, "learning_rate": 0.001, "loss": 1.8207, "step": 721400 }, { "epoch": 42.200386032637304, "grad_norm": 0.15736466646194458, "learning_rate": 0.001, "loss": 1.8253, "step": 721500 }, { "epoch": 42.20623501199041, "grad_norm": 0.17769138514995575, "learning_rate": 0.001, "loss": 1.8226, "step": 721600 }, { "epoch": 42.21208399134351, "grad_norm": 0.22853073477745056, "learning_rate": 0.001, "loss": 1.8254, "step": 721700 }, { "epoch": 42.217932970696616, "grad_norm": 0.24799759685993195, "learning_rate": 0.001, "loss": 1.8248, "step": 721800 }, { "epoch": 42.22378195004972, "grad_norm": 0.2562348544597626, "learning_rate": 0.001, "loss": 1.8283, "step": 721900 }, { "epoch": 42.229630929402816, "grad_norm": 0.13926354050636292, "learning_rate": 0.001, "loss": 1.8238, "step": 722000 }, { "epoch": 42.23547990875592, "grad_norm": 0.21962811052799225, "learning_rate": 0.001, "loss": 1.8191, "step": 722100 }, { "epoch": 42.241328888109024, "grad_norm": 0.20329485833644867, "learning_rate": 0.001, "loss": 1.8222, "step": 722200 }, { "epoch": 42.24717786746213, "grad_norm": 0.2697356939315796, "learning_rate": 0.001, "loss": 1.8208, "step": 722300 }, { "epoch": 42.25302684681523, "grad_norm": 0.18155886232852936, "learning_rate": 0.001, "loss": 1.8199, "step": 722400 }, { "epoch": 42.258875826168335, "grad_norm": 0.19294433295726776, "learning_rate": 0.001, "loss": 1.8265, "step": 722500 }, { "epoch": 42.26472480552144, "grad_norm": 0.2196694016456604, "learning_rate": 0.001, "loss": 1.8272, "step": 722600 }, { "epoch": 42.27057378487454, "grad_norm": 0.1392340362071991, "learning_rate": 0.001, "loss": 1.828, "step": 722700 }, { "epoch": 42.27642276422764, "grad_norm": 0.16419735550880432, "learning_rate": 0.001, "loss": 1.8243, "step": 722800 }, { "epoch": 42.28227174358074, "grad_norm": 0.20246157050132751, "learning_rate": 0.001, "loss": 1.8293, "step": 722900 }, { "epoch": 42.28812072293385, "grad_norm": 0.24182352423667908, "learning_rate": 0.001, "loss": 1.8255, "step": 723000 }, { "epoch": 42.29396970228695, "grad_norm": 0.16991311311721802, "learning_rate": 0.001, "loss": 1.821, "step": 723100 }, { "epoch": 42.299818681640055, "grad_norm": 0.3051963746547699, "learning_rate": 0.001, "loss": 1.8269, "step": 723200 }, { "epoch": 42.30566766099316, "grad_norm": 0.195326030254364, "learning_rate": 0.001, "loss": 1.8279, "step": 723300 }, { "epoch": 42.31151664034626, "grad_norm": 0.15689310431480408, "learning_rate": 0.001, "loss": 1.8215, "step": 723400 }, { "epoch": 42.31736561969936, "grad_norm": 0.18880034983158112, "learning_rate": 0.001, "loss": 1.8237, "step": 723500 }, { "epoch": 42.32321459905246, "grad_norm": 0.1625380665063858, "learning_rate": 0.001, "loss": 1.8258, "step": 723600 }, { "epoch": 42.32906357840557, "grad_norm": 0.19700965285301208, "learning_rate": 0.001, "loss": 1.8267, "step": 723700 }, { "epoch": 42.33491255775867, "grad_norm": 0.15842528641223907, "learning_rate": 0.001, "loss": 1.8255, "step": 723800 }, { "epoch": 42.340761537111774, "grad_norm": 0.1914348006248474, "learning_rate": 0.001, "loss": 1.8227, "step": 723900 }, { "epoch": 42.34661051646488, "grad_norm": 0.19790494441986084, "learning_rate": 0.001, "loss": 1.822, "step": 724000 }, { "epoch": 42.35245949581798, "grad_norm": 0.17336153984069824, "learning_rate": 0.001, "loss": 1.8213, "step": 724100 }, { "epoch": 42.358308475171086, "grad_norm": 0.1795332431793213, "learning_rate": 0.001, "loss": 1.8203, "step": 724200 }, { "epoch": 42.36415745452418, "grad_norm": 0.15755100548267365, "learning_rate": 0.001, "loss": 1.8281, "step": 724300 }, { "epoch": 42.370006433877286, "grad_norm": 0.22343668341636658, "learning_rate": 0.001, "loss": 1.8237, "step": 724400 }, { "epoch": 42.37585541323039, "grad_norm": 0.1780492067337036, "learning_rate": 0.001, "loss": 1.8263, "step": 724500 }, { "epoch": 42.381704392583494, "grad_norm": 0.1432356834411621, "learning_rate": 0.001, "loss": 1.8257, "step": 724600 }, { "epoch": 42.3875533719366, "grad_norm": 0.1868237853050232, "learning_rate": 0.001, "loss": 1.8276, "step": 724700 }, { "epoch": 42.3934023512897, "grad_norm": 0.4120250940322876, "learning_rate": 0.001, "loss": 1.8286, "step": 724800 }, { "epoch": 42.399251330642805, "grad_norm": 0.18785040080547333, "learning_rate": 0.001, "loss": 1.8268, "step": 724900 }, { "epoch": 42.40510030999591, "grad_norm": 0.18344977498054504, "learning_rate": 0.001, "loss": 1.8277, "step": 725000 }, { "epoch": 42.40510030999591, "eval_ag_news_accuracy": 0.238515625, "eval_ag_news_bleu_score": 6.921813555934851, "eval_ag_news_bleu_score_sem": 0.4890313742113381, "eval_ag_news_emb_cos_sim": 0.72220778465271, "eval_ag_news_emb_cos_sim_sem": 0.013933719135820866, "eval_ag_news_emb_top1_equal": 0.9765625, "eval_ag_news_emb_top1_equal_sem": 0.013424675911664963, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7057573795318604, "eval_ag_news_n_ngrams_match_1": 14.578125, "eval_ag_news_n_ngrams_match_2": 4.390625, "eval_ag_news_n_ngrams_match_3": 1.6640625, "eval_ag_news_num_pred_words": 47.109375, "eval_ag_news_num_true_words": 45.1640625, "eval_ag_news_perplexity": 14.965647074059087, "eval_ag_news_pred_num_tokens": 69.1953125, "eval_ag_news_rouge_score": 0.3096122838950301, "eval_ag_news_runtime": 36.8186, "eval_ag_news_samples_per_second": 13.58, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.34060285096250864, "eval_ag_news_token_set_f1_sem": 0.010384316030946392, "eval_ag_news_token_set_precision": 0.3255203451260911, "eval_ag_news_token_set_recall": 0.365714434532943, "eval_ag_news_true_num_tokens": 61.6796875, "step": 725000 }, { "epoch": 42.40510030999591, "eval_anthropic_toxic_prompts_accuracy": 0.105015625, "eval_anthropic_toxic_prompts_bleu_score": 44.63584585518247, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.5871952091194292, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8924176692962646, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009515105746686459, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02934655810211727, "eval_anthropic_toxic_prompts_loss": 1.2482348680496216, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 10.171875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.2890625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.1953125, "eval_anthropic_toxic_prompts_num_pred_words": 16.34375, "eval_anthropic_toxic_prompts_num_true_words": 16.03125, "eval_anthropic_toxic_prompts_perplexity": 3.4841874758067317, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.390625, "eval_anthropic_toxic_prompts_rouge_score": 0.6972699770629543, "eval_anthropic_toxic_prompts_runtime": 27.8931, "eval_anthropic_toxic_prompts_samples_per_second": 17.926, "eval_anthropic_toxic_prompts_steps_per_second": 0.036, "eval_anthropic_toxic_prompts_token_set_f1": 0.7199148005843572, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017890475582561807, "eval_anthropic_toxic_prompts_token_set_precision": 0.7190145744999458, "eval_anthropic_toxic_prompts_token_set_recall": 0.7306096797845221, "eval_anthropic_toxic_prompts_true_num_tokens": 19.265625, "step": 725000 }, { "epoch": 42.40510030999591, "eval_arxiv_accuracy": 0.377171875, "eval_arxiv_bleu_score": 1.6939973180479764, "eval_arxiv_bleu_score_sem": 0.1714849383461093, "eval_arxiv_emb_cos_sim": 0.4595220685005188, "eval_arxiv_emb_cos_sim_sem": 0.018429826945066452, "eval_arxiv_emb_top1_equal": 0.9140625, "eval_arxiv_emb_top1_equal_sem": 0.024870097637176514, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.42946195602417, "eval_arxiv_n_ngrams_match_1": 13.15625, "eval_arxiv_n_ngrams_match_2": 2.390625, "eval_arxiv_n_ngrams_match_3": 0.515625, "eval_arxiv_num_pred_words": 55.46875, "eval_arxiv_num_true_words": 87.7734375, "eval_arxiv_perplexity": 30.860034226510308, "eval_arxiv_pred_num_tokens": 126.234375, "eval_arxiv_rouge_score": 0.17183433240224644, "eval_arxiv_runtime": 32.0477, "eval_arxiv_samples_per_second": 15.602, "eval_arxiv_steps_per_second": 0.031, "eval_arxiv_token_set_f1": 0.17844567162030528, "eval_arxiv_token_set_f1_sem": 0.008568117054980073, "eval_arxiv_token_set_precision": 0.12115723345206814, "eval_arxiv_token_set_recall": 0.43650838097184086, "eval_arxiv_true_num_tokens": 126.0546875, "step": 725000 }, { "epoch": 42.40510030999591, "eval_python_code_alpaca_accuracy": 0.130359375, "eval_python_code_alpaca_bleu_score": 25.196604837515387, "eval_python_code_alpaca_bleu_score_sem": 1.373815433696386, "eval_python_code_alpaca_emb_cos_sim": 0.8638708591461182, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009767187759280205, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4747440814971924, "eval_python_code_alpaca_n_ngrams_match_1": 10.265625, "eval_python_code_alpaca_n_ngrams_match_2": 5.390625, "eval_python_code_alpaca_n_ngrams_match_3": 3.0625, "eval_python_code_alpaca_num_pred_words": 18.3359375, "eval_python_code_alpaca_num_true_words": 19.0546875, "eval_python_code_alpaca_perplexity": 4.369917287125742, "eval_python_code_alpaca_pred_num_tokens": 24.4609375, "eval_python_code_alpaca_rouge_score": 0.5572075918030817, "eval_python_code_alpaca_runtime": 31.7658, "eval_python_code_alpaca_samples_per_second": 15.74, "eval_python_code_alpaca_steps_per_second": 0.031, "eval_python_code_alpaca_token_set_f1": 0.5804444630917017, "eval_python_code_alpaca_token_set_f1_sem": 0.012318775296493055, "eval_python_code_alpaca_token_set_precision": 0.5650498368295988, "eval_python_code_alpaca_token_set_recall": 0.6043442828658236, "eval_python_code_alpaca_true_num_tokens": 24.6171875, "step": 725000 }, { "epoch": 42.40510030999591, "eval_wikibio_accuracy": 0.3669375, "eval_wikibio_bleu_score": 7.374826616645143, "eval_wikibio_bleu_score_sem": 0.6769734872272828, "eval_wikibio_emb_cos_sim": 0.5943082571029663, "eval_wikibio_emb_cos_sim_sem": 0.02357296645641327, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6877105236053467, "eval_wikibio_n_ngrams_match_1": 15.1640625, "eval_wikibio_n_ngrams_match_2": 5.03125, "eval_wikibio_n_ngrams_match_3": 2.1640625, "eval_wikibio_num_pred_words": 52.6640625, "eval_wikibio_num_true_words": 52.1171875, "eval_wikibio_perplexity": 14.69798667624945, "eval_wikibio_pred_num_tokens": 106.8515625, "eval_wikibio_rouge_score": 0.2986403696462778, "eval_wikibio_runtime": 31.7094, "eval_wikibio_samples_per_second": 15.768, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.3164720158269328, "eval_wikibio_token_set_f1_sem": 0.012393192632851442, "eval_wikibio_token_set_precision": 0.2804432538955134, "eval_wikibio_token_set_recall": 0.3996973575443436, "eval_wikibio_true_num_tokens": 100.2578125, "step": 725000 }, { "epoch": 42.40510030999591, "eval_msmarco_accuracy": 0.401671875, "eval_msmarco_bleu_score": 16.84789925074682, "eval_msmarco_bleu_score_sem": 1.4461375957968357, "eval_msmarco_emb_cos_sim": 0.8044129610061646, "eval_msmarco_emb_cos_sim_sem": 0.014850286766886711, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7031315565109253, "eval_msmarco_n_ngrams_match_1": 27.3203125, "eval_msmarco_n_ngrams_match_2": 12.421875, "eval_msmarco_n_ngrams_match_3": 7.0546875, "eval_msmarco_num_pred_words": 60.3984375, "eval_msmarco_num_true_words": 60.265625, "eval_msmarco_perplexity": 5.491116235894364, "eval_msmarco_pred_num_tokens": 83.515625, "eval_msmarco_rouge_score": 0.4335298728941299, "eval_msmarco_runtime": 25.9555, "eval_msmarco_samples_per_second": 19.264, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4630438051673777, "eval_msmarco_token_set_f1_sem": 0.014250923463164217, "eval_msmarco_token_set_precision": 0.4282117259035924, "eval_msmarco_token_set_recall": 0.5203554345615802, "eval_msmarco_true_num_tokens": 77.140625, "step": 725000 }, { "epoch": 42.410949289349006, "grad_norm": 0.21542808413505554, "learning_rate": 0.001, "loss": 1.8243, "step": 725100 }, { "epoch": 42.41679826870211, "grad_norm": 0.16282004117965698, "learning_rate": 0.001, "loss": 1.8317, "step": 725200 }, { "epoch": 42.42264724805521, "grad_norm": 0.26654040813446045, "learning_rate": 0.001, "loss": 1.8269, "step": 725300 }, { "epoch": 42.42849622740832, "grad_norm": 0.19002078473567963, "learning_rate": 0.001, "loss": 1.8288, "step": 725400 }, { "epoch": 42.43434520676142, "grad_norm": 0.1759246438741684, "learning_rate": 0.001, "loss": 1.8355, "step": 725500 }, { "epoch": 42.440194186114525, "grad_norm": 0.2664033770561218, "learning_rate": 0.001, "loss": 1.8278, "step": 725600 }, { "epoch": 42.44604316546763, "grad_norm": 0.15429846942424774, "learning_rate": 0.001, "loss": 1.8249, "step": 725700 }, { "epoch": 42.45189214482073, "grad_norm": 0.20380237698554993, "learning_rate": 0.001, "loss": 1.8283, "step": 725800 }, { "epoch": 42.45774112417383, "grad_norm": 0.20340487360954285, "learning_rate": 0.001, "loss": 1.8265, "step": 725900 }, { "epoch": 42.46359010352693, "grad_norm": 0.262198269367218, "learning_rate": 0.001, "loss": 1.8236, "step": 726000 }, { "epoch": 42.46943908288004, "grad_norm": 0.2088189274072647, "learning_rate": 0.001, "loss": 1.827, "step": 726100 }, { "epoch": 42.47528806223314, "grad_norm": 0.2081323117017746, "learning_rate": 0.001, "loss": 1.8264, "step": 726200 }, { "epoch": 42.481137041586244, "grad_norm": 0.2780885696411133, "learning_rate": 0.001, "loss": 1.8273, "step": 726300 }, { "epoch": 42.48698602093935, "grad_norm": 0.16556616127490997, "learning_rate": 0.001, "loss": 1.8274, "step": 726400 }, { "epoch": 42.49283500029245, "grad_norm": 0.19550290703773499, "learning_rate": 0.001, "loss": 1.824, "step": 726500 }, { "epoch": 42.49868397964555, "grad_norm": 0.18505887687206268, "learning_rate": 0.001, "loss": 1.8292, "step": 726600 }, { "epoch": 42.50453295899865, "grad_norm": 0.24621616303920746, "learning_rate": 0.001, "loss": 1.8299, "step": 726700 }, { "epoch": 42.510381938351756, "grad_norm": 0.16765369474887848, "learning_rate": 0.001, "loss": 1.8295, "step": 726800 }, { "epoch": 42.51623091770486, "grad_norm": 0.19822652637958527, "learning_rate": 0.001, "loss": 1.8227, "step": 726900 }, { "epoch": 42.522079897057964, "grad_norm": 0.16930381953716278, "learning_rate": 0.001, "loss": 1.8296, "step": 727000 }, { "epoch": 42.52792887641107, "grad_norm": 0.23322246968746185, "learning_rate": 0.001, "loss": 1.825, "step": 727100 }, { "epoch": 42.53377785576417, "grad_norm": 0.16193021833896637, "learning_rate": 0.001, "loss": 1.8283, "step": 727200 }, { "epoch": 42.539626835117275, "grad_norm": 0.2063484489917755, "learning_rate": 0.001, "loss": 1.828, "step": 727300 }, { "epoch": 42.54547581447037, "grad_norm": 0.2424866259098053, "learning_rate": 0.001, "loss": 1.8354, "step": 727400 }, { "epoch": 42.551324793823476, "grad_norm": 0.1478533148765564, "learning_rate": 0.001, "loss": 1.8289, "step": 727500 }, { "epoch": 42.55717377317658, "grad_norm": 0.1825922727584839, "learning_rate": 0.001, "loss": 1.8276, "step": 727600 }, { "epoch": 42.56302275252968, "grad_norm": 0.2125585675239563, "learning_rate": 0.001, "loss": 1.8299, "step": 727700 }, { "epoch": 42.56887173188279, "grad_norm": 0.1570470631122589, "learning_rate": 0.001, "loss": 1.828, "step": 727800 }, { "epoch": 42.57472071123589, "grad_norm": 0.2265428602695465, "learning_rate": 0.001, "loss": 1.8347, "step": 727900 }, { "epoch": 42.580569690588995, "grad_norm": 0.22837066650390625, "learning_rate": 0.001, "loss": 1.8295, "step": 728000 }, { "epoch": 42.5864186699421, "grad_norm": 0.16292063891887665, "learning_rate": 0.001, "loss": 1.8209, "step": 728100 }, { "epoch": 42.592267649295195, "grad_norm": 0.19160330295562744, "learning_rate": 0.001, "loss": 1.8277, "step": 728200 }, { "epoch": 42.5981166286483, "grad_norm": 0.24248313903808594, "learning_rate": 0.001, "loss": 1.8257, "step": 728300 }, { "epoch": 42.6039656080014, "grad_norm": 0.21584554016590118, "learning_rate": 0.001, "loss": 1.8264, "step": 728400 }, { "epoch": 42.609814587354506, "grad_norm": 0.193873330950737, "learning_rate": 0.001, "loss": 1.8277, "step": 728500 }, { "epoch": 42.61566356670761, "grad_norm": 0.17287307977676392, "learning_rate": 0.001, "loss": 1.8296, "step": 728600 }, { "epoch": 42.621512546060714, "grad_norm": 0.1848331242799759, "learning_rate": 0.001, "loss": 1.8336, "step": 728700 }, { "epoch": 42.62736152541382, "grad_norm": 0.20421205461025238, "learning_rate": 0.001, "loss": 1.8265, "step": 728800 }, { "epoch": 42.63321050476692, "grad_norm": 0.17422263324260712, "learning_rate": 0.001, "loss": 1.8307, "step": 728900 }, { "epoch": 42.63905948412002, "grad_norm": 0.17191877961158752, "learning_rate": 0.001, "loss": 1.8292, "step": 729000 }, { "epoch": 42.64490846347312, "grad_norm": 0.1838715523481369, "learning_rate": 0.001, "loss": 1.8353, "step": 729100 }, { "epoch": 42.650757442826226, "grad_norm": 0.15230581164360046, "learning_rate": 0.001, "loss": 1.8348, "step": 729200 }, { "epoch": 42.65660642217933, "grad_norm": 0.2007419615983963, "learning_rate": 0.001, "loss": 1.8272, "step": 729300 }, { "epoch": 42.66245540153243, "grad_norm": 0.20323660969734192, "learning_rate": 0.001, "loss": 1.8365, "step": 729400 }, { "epoch": 42.66830438088554, "grad_norm": 0.2289678454399109, "learning_rate": 0.001, "loss": 1.8297, "step": 729500 }, { "epoch": 42.67415336023864, "grad_norm": 0.36226412653923035, "learning_rate": 0.001, "loss": 1.8325, "step": 729600 }, { "epoch": 42.68000233959174, "grad_norm": 0.23222284018993378, "learning_rate": 0.001, "loss": 1.831, "step": 729700 }, { "epoch": 42.68585131894484, "grad_norm": 0.18549032509326935, "learning_rate": 0.001, "loss": 1.8361, "step": 729800 }, { "epoch": 42.691700298297945, "grad_norm": 0.189226433634758, "learning_rate": 0.001, "loss": 1.8309, "step": 729900 }, { "epoch": 42.69754927765105, "grad_norm": 0.1670263111591339, "learning_rate": 0.001, "loss": 1.8292, "step": 730000 }, { "epoch": 42.70339825700415, "grad_norm": 0.14646323025226593, "learning_rate": 0.001, "loss": 1.8214, "step": 730100 }, { "epoch": 42.70924723635726, "grad_norm": 0.15431655943393707, "learning_rate": 0.001, "loss": 1.8303, "step": 730200 }, { "epoch": 42.71509621571036, "grad_norm": 0.23063668608665466, "learning_rate": 0.001, "loss": 1.8329, "step": 730300 }, { "epoch": 42.720945195063464, "grad_norm": 0.25384438037872314, "learning_rate": 0.001, "loss": 1.8325, "step": 730400 }, { "epoch": 42.72679417441656, "grad_norm": 0.20772185921669006, "learning_rate": 0.001, "loss": 1.8286, "step": 730500 }, { "epoch": 42.732643153769665, "grad_norm": 0.15436652302742004, "learning_rate": 0.001, "loss": 1.8283, "step": 730600 }, { "epoch": 42.73849213312277, "grad_norm": 0.17535772919654846, "learning_rate": 0.001, "loss": 1.8305, "step": 730700 }, { "epoch": 42.74434111247587, "grad_norm": 0.1799711138010025, "learning_rate": 0.001, "loss": 1.8321, "step": 730800 }, { "epoch": 42.750190091828976, "grad_norm": 0.22230295836925507, "learning_rate": 0.001, "loss": 1.828, "step": 730900 }, { "epoch": 42.75603907118208, "grad_norm": 0.16698214411735535, "learning_rate": 0.001, "loss": 1.8328, "step": 731000 }, { "epoch": 42.761888050535184, "grad_norm": 0.2511850893497467, "learning_rate": 0.001, "loss": 1.8339, "step": 731100 }, { "epoch": 42.76773702988829, "grad_norm": 0.24240975081920624, "learning_rate": 0.001, "loss": 1.8327, "step": 731200 }, { "epoch": 42.773586009241384, "grad_norm": 0.16939634084701538, "learning_rate": 0.001, "loss": 1.8324, "step": 731300 }, { "epoch": 42.77943498859449, "grad_norm": 0.19960620999336243, "learning_rate": 0.001, "loss": 1.8299, "step": 731400 }, { "epoch": 42.78528396794759, "grad_norm": 0.15251578390598297, "learning_rate": 0.001, "loss": 1.8302, "step": 731500 }, { "epoch": 42.791132947300696, "grad_norm": 0.17023666203022003, "learning_rate": 0.001, "loss": 1.8247, "step": 731600 }, { "epoch": 42.7969819266538, "grad_norm": 0.20800618827342987, "learning_rate": 0.001, "loss": 1.8279, "step": 731700 }, { "epoch": 42.8028309060069, "grad_norm": 0.15497787296772003, "learning_rate": 0.001, "loss": 1.8279, "step": 731800 }, { "epoch": 42.80867988536001, "grad_norm": 0.2251001000404358, "learning_rate": 0.001, "loss": 1.8279, "step": 731900 }, { "epoch": 42.81452886471311, "grad_norm": 0.1832069754600525, "learning_rate": 0.001, "loss": 1.8348, "step": 732000 }, { "epoch": 42.82037784406621, "grad_norm": 0.16975629329681396, "learning_rate": 0.001, "loss": 1.8292, "step": 732100 }, { "epoch": 42.82622682341931, "grad_norm": 0.18519464135169983, "learning_rate": 0.001, "loss": 1.8386, "step": 732200 }, { "epoch": 42.832075802772415, "grad_norm": 0.4717586934566498, "learning_rate": 0.001, "loss": 1.83, "step": 732300 }, { "epoch": 42.83792478212552, "grad_norm": 0.16855083405971527, "learning_rate": 0.001, "loss": 1.8307, "step": 732400 }, { "epoch": 42.84377376147862, "grad_norm": 0.18788626790046692, "learning_rate": 0.001, "loss": 1.8314, "step": 732500 }, { "epoch": 42.84962274083173, "grad_norm": 0.1989366114139557, "learning_rate": 0.001, "loss": 1.8267, "step": 732600 }, { "epoch": 42.85547172018483, "grad_norm": 0.14593075215816498, "learning_rate": 0.001, "loss": 1.8315, "step": 732700 }, { "epoch": 42.86132069953793, "grad_norm": 0.21156807243824005, "learning_rate": 0.001, "loss": 1.8265, "step": 732800 }, { "epoch": 42.86716967889103, "grad_norm": 0.20679983496665955, "learning_rate": 0.001, "loss": 1.8371, "step": 732900 }, { "epoch": 42.873018658244135, "grad_norm": 0.1610647439956665, "learning_rate": 0.001, "loss": 1.8296, "step": 733000 }, { "epoch": 42.87886763759724, "grad_norm": 0.2834353446960449, "learning_rate": 0.001, "loss": 1.8301, "step": 733100 }, { "epoch": 42.88471661695034, "grad_norm": 0.19304819405078888, "learning_rate": 0.001, "loss": 1.8341, "step": 733200 }, { "epoch": 42.890565596303446, "grad_norm": 0.2199736386537552, "learning_rate": 0.001, "loss": 1.8359, "step": 733300 }, { "epoch": 42.89641457565655, "grad_norm": 0.19320422410964966, "learning_rate": 0.001, "loss": 1.8317, "step": 733400 }, { "epoch": 42.902263555009654, "grad_norm": 0.20594753324985504, "learning_rate": 0.001, "loss": 1.8332, "step": 733500 }, { "epoch": 42.90811253436275, "grad_norm": 0.2305251657962799, "learning_rate": 0.001, "loss": 1.8295, "step": 733600 }, { "epoch": 42.913961513715854, "grad_norm": 0.18283706903457642, "learning_rate": 0.001, "loss": 1.8398, "step": 733700 }, { "epoch": 42.91981049306896, "grad_norm": 0.26328232884407043, "learning_rate": 0.001, "loss": 1.8316, "step": 733800 }, { "epoch": 42.92565947242206, "grad_norm": 0.22565355896949768, "learning_rate": 0.001, "loss": 1.8336, "step": 733900 }, { "epoch": 42.931508451775166, "grad_norm": 0.18880043923854828, "learning_rate": 0.001, "loss": 1.836, "step": 734000 }, { "epoch": 42.93735743112827, "grad_norm": 0.17208720743656158, "learning_rate": 0.001, "loss": 1.8301, "step": 734100 }, { "epoch": 42.94320641048137, "grad_norm": 0.1923162043094635, "learning_rate": 0.001, "loss": 1.8317, "step": 734200 }, { "epoch": 42.94905538983448, "grad_norm": 0.19210700690746307, "learning_rate": 0.001, "loss": 1.8378, "step": 734300 }, { "epoch": 42.954904369187574, "grad_norm": 0.17670303583145142, "learning_rate": 0.001, "loss": 1.8311, "step": 734400 }, { "epoch": 42.96075334854068, "grad_norm": 0.2361510545015335, "learning_rate": 0.001, "loss": 1.835, "step": 734500 }, { "epoch": 42.96660232789378, "grad_norm": 0.19659961760044098, "learning_rate": 0.001, "loss": 1.8318, "step": 734600 }, { "epoch": 42.972451307246885, "grad_norm": 0.17867572605609894, "learning_rate": 0.001, "loss": 1.8324, "step": 734700 }, { "epoch": 42.97830028659999, "grad_norm": 0.17730677127838135, "learning_rate": 0.001, "loss": 1.8284, "step": 734800 }, { "epoch": 42.98414926595309, "grad_norm": 0.1667163074016571, "learning_rate": 0.001, "loss": 1.83, "step": 734900 }, { "epoch": 42.9899982453062, "grad_norm": 0.17092958092689514, "learning_rate": 0.001, "loss": 1.8262, "step": 735000 }, { "epoch": 42.9958472246593, "grad_norm": 0.17706657946109772, "learning_rate": 0.001, "loss": 1.8349, "step": 735100 }, { "epoch": 43.0016962040124, "grad_norm": 0.16728012263774872, "learning_rate": 0.001, "loss": 1.8322, "step": 735200 }, { "epoch": 43.0075451833655, "grad_norm": 0.1726408451795578, "learning_rate": 0.001, "loss": 1.8181, "step": 735300 }, { "epoch": 43.013394162718605, "grad_norm": 0.20764026045799255, "learning_rate": 0.001, "loss": 1.8219, "step": 735400 }, { "epoch": 43.01924314207171, "grad_norm": 0.2415199875831604, "learning_rate": 0.001, "loss": 1.8174, "step": 735500 }, { "epoch": 43.02509212142481, "grad_norm": 0.18808327615261078, "learning_rate": 0.001, "loss": 1.8141, "step": 735600 }, { "epoch": 43.030941100777916, "grad_norm": 0.18332462012767792, "learning_rate": 0.001, "loss": 1.819, "step": 735700 }, { "epoch": 43.03679008013102, "grad_norm": 0.1895163357257843, "learning_rate": 0.001, "loss": 1.8186, "step": 735800 }, { "epoch": 43.04263905948412, "grad_norm": 0.2049260139465332, "learning_rate": 0.001, "loss": 1.8155, "step": 735900 }, { "epoch": 43.04848803883722, "grad_norm": 0.2235790491104126, "learning_rate": 0.001, "loss": 1.822, "step": 736000 }, { "epoch": 43.054337018190324, "grad_norm": 0.17826275527477264, "learning_rate": 0.001, "loss": 1.8226, "step": 736100 }, { "epoch": 43.06018599754343, "grad_norm": 0.1676456481218338, "learning_rate": 0.001, "loss": 1.8223, "step": 736200 }, { "epoch": 43.06603497689653, "grad_norm": 0.1615382730960846, "learning_rate": 0.001, "loss": 1.819, "step": 736300 }, { "epoch": 43.071883956249636, "grad_norm": 0.16701841354370117, "learning_rate": 0.001, "loss": 1.8148, "step": 736400 }, { "epoch": 43.07773293560274, "grad_norm": 0.17180375754833221, "learning_rate": 0.001, "loss": 1.8176, "step": 736500 }, { "epoch": 43.08358191495584, "grad_norm": 0.18109235167503357, "learning_rate": 0.001, "loss": 1.8165, "step": 736600 }, { "epoch": 43.08943089430894, "grad_norm": 0.19212502241134644, "learning_rate": 0.001, "loss": 1.8171, "step": 736700 }, { "epoch": 43.095279873662044, "grad_norm": 0.16780982911586761, "learning_rate": 0.001, "loss": 1.8225, "step": 736800 }, { "epoch": 43.10112885301515, "grad_norm": 0.19425269961357117, "learning_rate": 0.001, "loss": 1.8233, "step": 736900 }, { "epoch": 43.10697783236825, "grad_norm": 0.1504608392715454, "learning_rate": 0.001, "loss": 1.8182, "step": 737000 }, { "epoch": 43.112826811721355, "grad_norm": 0.1963227242231369, "learning_rate": 0.001, "loss": 1.8225, "step": 737100 }, { "epoch": 43.11867579107446, "grad_norm": 0.18002305924892426, "learning_rate": 0.001, "loss": 1.8203, "step": 737200 }, { "epoch": 43.12452477042756, "grad_norm": 0.23448221385478973, "learning_rate": 0.001, "loss": 1.8227, "step": 737300 }, { "epoch": 43.13037374978067, "grad_norm": 0.16451771557331085, "learning_rate": 0.001, "loss": 1.8183, "step": 737400 }, { "epoch": 43.13622272913376, "grad_norm": 0.19012269377708435, "learning_rate": 0.001, "loss": 1.8218, "step": 737500 }, { "epoch": 43.14207170848687, "grad_norm": 0.20944969356060028, "learning_rate": 0.001, "loss": 1.8207, "step": 737600 }, { "epoch": 43.14792068783997, "grad_norm": 0.17180098593235016, "learning_rate": 0.001, "loss": 1.8244, "step": 737700 }, { "epoch": 43.153769667193075, "grad_norm": 0.19568230211734772, "learning_rate": 0.001, "loss": 1.8257, "step": 737800 }, { "epoch": 43.15961864654618, "grad_norm": 0.2233809232711792, "learning_rate": 0.001, "loss": 1.8243, "step": 737900 }, { "epoch": 43.16546762589928, "grad_norm": 0.1800101101398468, "learning_rate": 0.001, "loss": 1.8232, "step": 738000 }, { "epoch": 43.171316605252386, "grad_norm": 0.19208592176437378, "learning_rate": 0.001, "loss": 1.8222, "step": 738100 }, { "epoch": 43.17716558460549, "grad_norm": 0.2060570865869522, "learning_rate": 0.001, "loss": 1.8199, "step": 738200 }, { "epoch": 43.18301456395859, "grad_norm": 0.1553080528974533, "learning_rate": 0.001, "loss": 1.8189, "step": 738300 }, { "epoch": 43.18886354331169, "grad_norm": 0.1864234209060669, "learning_rate": 0.001, "loss": 1.8182, "step": 738400 }, { "epoch": 43.194712522664794, "grad_norm": 0.1693667620420456, "learning_rate": 0.001, "loss": 1.8204, "step": 738500 }, { "epoch": 43.2005615020179, "grad_norm": 0.20661373436450958, "learning_rate": 0.001, "loss": 1.8259, "step": 738600 }, { "epoch": 43.206410481371, "grad_norm": 0.16198687255382538, "learning_rate": 0.001, "loss": 1.8253, "step": 738700 }, { "epoch": 43.212259460724106, "grad_norm": 0.17205315828323364, "learning_rate": 0.001, "loss": 1.8171, "step": 738800 }, { "epoch": 43.21810844007721, "grad_norm": 0.20887959003448486, "learning_rate": 0.001, "loss": 1.8219, "step": 738900 }, { "epoch": 43.223957419430306, "grad_norm": 0.18377579748630524, "learning_rate": 0.001, "loss": 1.8205, "step": 739000 }, { "epoch": 43.22980639878341, "grad_norm": 0.2422381192445755, "learning_rate": 0.001, "loss": 1.8234, "step": 739100 }, { "epoch": 43.235655378136514, "grad_norm": 0.21582092344760895, "learning_rate": 0.001, "loss": 1.8307, "step": 739200 }, { "epoch": 43.24150435748962, "grad_norm": 0.17627890408039093, "learning_rate": 0.001, "loss": 1.8229, "step": 739300 }, { "epoch": 43.24735333684272, "grad_norm": 0.15149454772472382, "learning_rate": 0.001, "loss": 1.8213, "step": 739400 }, { "epoch": 43.253202316195825, "grad_norm": 0.25380170345306396, "learning_rate": 0.001, "loss": 1.8261, "step": 739500 }, { "epoch": 43.25905129554893, "grad_norm": 0.19701527059078217, "learning_rate": 0.001, "loss": 1.8264, "step": 739600 }, { "epoch": 43.26490027490203, "grad_norm": 0.14131776988506317, "learning_rate": 0.001, "loss": 1.8151, "step": 739700 }, { "epoch": 43.27074925425513, "grad_norm": 0.1914328932762146, "learning_rate": 0.001, "loss": 1.819, "step": 739800 }, { "epoch": 43.27659823360823, "grad_norm": 0.19090823829174042, "learning_rate": 0.001, "loss": 1.8267, "step": 739900 }, { "epoch": 43.28244721296134, "grad_norm": 0.2303151786327362, "learning_rate": 0.001, "loss": 1.8233, "step": 740000 }, { "epoch": 43.28829619231444, "grad_norm": 0.20873449742794037, "learning_rate": 0.001, "loss": 1.8246, "step": 740100 }, { "epoch": 43.294145171667545, "grad_norm": 0.22239691019058228, "learning_rate": 0.001, "loss": 1.8246, "step": 740200 }, { "epoch": 43.29999415102065, "grad_norm": 0.19991280138492584, "learning_rate": 0.001, "loss": 1.8274, "step": 740300 }, { "epoch": 43.30584313037375, "grad_norm": 0.1599837839603424, "learning_rate": 0.001, "loss": 1.8222, "step": 740400 }, { "epoch": 43.311692109726856, "grad_norm": 0.19781067967414856, "learning_rate": 0.001, "loss": 1.8257, "step": 740500 }, { "epoch": 43.31754108907995, "grad_norm": 0.17941300570964813, "learning_rate": 0.001, "loss": 1.8189, "step": 740600 }, { "epoch": 43.323390068433056, "grad_norm": 0.22711816430091858, "learning_rate": 0.001, "loss": 1.8217, "step": 740700 }, { "epoch": 43.32923904778616, "grad_norm": 0.23676016926765442, "learning_rate": 0.001, "loss": 1.8255, "step": 740800 }, { "epoch": 43.335088027139264, "grad_norm": 0.18568089604377747, "learning_rate": 0.001, "loss": 1.8239, "step": 740900 }, { "epoch": 43.34093700649237, "grad_norm": 0.18755479156970978, "learning_rate": 0.001, "loss": 1.8249, "step": 741000 }, { "epoch": 43.34678598584547, "grad_norm": 0.2205437570810318, "learning_rate": 0.001, "loss": 1.8204, "step": 741100 }, { "epoch": 43.352634965198575, "grad_norm": 0.20516495406627655, "learning_rate": 0.001, "loss": 1.8253, "step": 741200 }, { "epoch": 43.35848394455168, "grad_norm": 0.2073173075914383, "learning_rate": 0.001, "loss": 1.8272, "step": 741300 }, { "epoch": 43.364332923904776, "grad_norm": 0.15135863423347473, "learning_rate": 0.001, "loss": 1.8253, "step": 741400 }, { "epoch": 43.37018190325788, "grad_norm": 0.1623510718345642, "learning_rate": 0.001, "loss": 1.8234, "step": 741500 }, { "epoch": 43.37603088261098, "grad_norm": 0.15858778357505798, "learning_rate": 0.001, "loss": 1.8291, "step": 741600 }, { "epoch": 43.38187986196409, "grad_norm": 0.16419966518878937, "learning_rate": 0.001, "loss": 1.8231, "step": 741700 }, { "epoch": 43.38772884131719, "grad_norm": 0.18322283029556274, "learning_rate": 0.001, "loss": 1.8248, "step": 741800 }, { "epoch": 43.393577820670295, "grad_norm": 0.18131956458091736, "learning_rate": 0.001, "loss": 1.8204, "step": 741900 }, { "epoch": 43.3994268000234, "grad_norm": 0.2427024096250534, "learning_rate": 0.001, "loss": 1.827, "step": 742000 }, { "epoch": 43.405275779376495, "grad_norm": 0.18407972157001495, "learning_rate": 0.001, "loss": 1.825, "step": 742100 }, { "epoch": 43.4111247587296, "grad_norm": 0.13932651281356812, "learning_rate": 0.001, "loss": 1.8222, "step": 742200 }, { "epoch": 43.4169737380827, "grad_norm": 0.1773647964000702, "learning_rate": 0.001, "loss": 1.8258, "step": 742300 }, { "epoch": 43.42282271743581, "grad_norm": 0.19394497573375702, "learning_rate": 0.001, "loss": 1.8263, "step": 742400 }, { "epoch": 43.42867169678891, "grad_norm": 0.15809471905231476, "learning_rate": 0.001, "loss": 1.8228, "step": 742500 }, { "epoch": 43.434520676142014, "grad_norm": 0.19428789615631104, "learning_rate": 0.001, "loss": 1.8233, "step": 742600 }, { "epoch": 43.44036965549512, "grad_norm": 0.13508406281471252, "learning_rate": 0.001, "loss": 1.8185, "step": 742700 }, { "epoch": 43.44621863484822, "grad_norm": 0.206324502825737, "learning_rate": 0.001, "loss": 1.8222, "step": 742800 }, { "epoch": 43.45206761420132, "grad_norm": 0.19466811418533325, "learning_rate": 0.001, "loss": 1.8306, "step": 742900 }, { "epoch": 43.45791659355442, "grad_norm": 0.2282906025648117, "learning_rate": 0.001, "loss": 1.8281, "step": 743000 }, { "epoch": 43.463765572907526, "grad_norm": 0.18955253064632416, "learning_rate": 0.001, "loss": 1.8282, "step": 743100 }, { "epoch": 43.46961455226063, "grad_norm": 0.18397250771522522, "learning_rate": 0.001, "loss": 1.8234, "step": 743200 }, { "epoch": 43.475463531613734, "grad_norm": 0.2689410150051117, "learning_rate": 0.001, "loss": 1.8325, "step": 743300 }, { "epoch": 43.48131251096684, "grad_norm": 0.20572270452976227, "learning_rate": 0.001, "loss": 1.8231, "step": 743400 }, { "epoch": 43.48716149031994, "grad_norm": 0.17382146418094635, "learning_rate": 0.001, "loss": 1.8232, "step": 743500 }, { "epoch": 43.493010469673045, "grad_norm": 0.1889781504869461, "learning_rate": 0.001, "loss": 1.8271, "step": 743600 }, { "epoch": 43.49885944902614, "grad_norm": 0.23120388388633728, "learning_rate": 0.001, "loss": 1.8276, "step": 743700 }, { "epoch": 43.504708428379246, "grad_norm": 0.13673962652683258, "learning_rate": 0.001, "loss": 1.8287, "step": 743800 }, { "epoch": 43.51055740773235, "grad_norm": 0.1461697518825531, "learning_rate": 0.001, "loss": 1.8243, "step": 743900 }, { "epoch": 43.51640638708545, "grad_norm": 0.1502522975206375, "learning_rate": 0.001, "loss": 1.8265, "step": 744000 }, { "epoch": 43.52225536643856, "grad_norm": 0.17700015008449554, "learning_rate": 0.001, "loss": 1.8265, "step": 744100 }, { "epoch": 43.52810434579166, "grad_norm": 0.3042341470718384, "learning_rate": 0.001, "loss": 1.8285, "step": 744200 }, { "epoch": 43.533953325144765, "grad_norm": 0.201887845993042, "learning_rate": 0.001, "loss": 1.8203, "step": 744300 }, { "epoch": 43.53980230449787, "grad_norm": 0.1514289677143097, "learning_rate": 0.001, "loss": 1.8244, "step": 744400 }, { "epoch": 43.545651283850965, "grad_norm": 0.16983523964881897, "learning_rate": 0.001, "loss": 1.8295, "step": 744500 }, { "epoch": 43.55150026320407, "grad_norm": 0.1870233118534088, "learning_rate": 0.001, "loss": 1.8318, "step": 744600 }, { "epoch": 43.55734924255717, "grad_norm": 0.14889778196811676, "learning_rate": 0.001, "loss": 1.8238, "step": 744700 }, { "epoch": 43.56319822191028, "grad_norm": 0.20637568831443787, "learning_rate": 0.001, "loss": 1.8236, "step": 744800 }, { "epoch": 43.56904720126338, "grad_norm": 0.18795962631702423, "learning_rate": 0.001, "loss": 1.8284, "step": 744900 }, { "epoch": 43.574896180616484, "grad_norm": 0.20972438156604767, "learning_rate": 0.001, "loss": 1.8248, "step": 745000 }, { "epoch": 43.58074515996959, "grad_norm": 0.1879221647977829, "learning_rate": 0.001, "loss": 1.8289, "step": 745100 }, { "epoch": 43.586594139322685, "grad_norm": 0.22973114252090454, "learning_rate": 0.001, "loss": 1.8264, "step": 745200 }, { "epoch": 43.59244311867579, "grad_norm": 0.14148475229740143, "learning_rate": 0.001, "loss": 1.8231, "step": 745300 }, { "epoch": 43.59829209802889, "grad_norm": 0.18761524558067322, "learning_rate": 0.001, "loss": 1.8203, "step": 745400 }, { "epoch": 43.604141077381996, "grad_norm": 0.17097878456115723, "learning_rate": 0.001, "loss": 1.8298, "step": 745500 }, { "epoch": 43.6099900567351, "grad_norm": 0.21360349655151367, "learning_rate": 0.001, "loss": 1.8207, "step": 745600 }, { "epoch": 43.615839036088204, "grad_norm": 0.20129235088825226, "learning_rate": 0.001, "loss": 1.821, "step": 745700 }, { "epoch": 43.62168801544131, "grad_norm": 0.18321533501148224, "learning_rate": 0.001, "loss": 1.8255, "step": 745800 }, { "epoch": 43.62753699479441, "grad_norm": 0.16309718787670135, "learning_rate": 0.001, "loss": 1.8291, "step": 745900 }, { "epoch": 43.63338597414751, "grad_norm": 0.14205226302146912, "learning_rate": 0.001, "loss": 1.8251, "step": 746000 }, { "epoch": 43.63923495350061, "grad_norm": 0.17378520965576172, "learning_rate": 0.001, "loss": 1.8271, "step": 746100 }, { "epoch": 43.645083932853716, "grad_norm": 0.2011490911245346, "learning_rate": 0.001, "loss": 1.8318, "step": 746200 }, { "epoch": 43.65093291220682, "grad_norm": 0.19348232448101044, "learning_rate": 0.001, "loss": 1.8259, "step": 746300 }, { "epoch": 43.65678189155992, "grad_norm": 0.21655960381031036, "learning_rate": 0.001, "loss": 1.8343, "step": 746400 }, { "epoch": 43.66263087091303, "grad_norm": 0.1782805472612381, "learning_rate": 0.001, "loss": 1.8294, "step": 746500 }, { "epoch": 43.66847985026613, "grad_norm": 0.2733813524246216, "learning_rate": 0.001, "loss": 1.8335, "step": 746600 }, { "epoch": 43.674328829619235, "grad_norm": 0.1927563101053238, "learning_rate": 0.001, "loss": 1.8302, "step": 746700 }, { "epoch": 43.68017780897233, "grad_norm": 0.17064091563224792, "learning_rate": 0.001, "loss": 1.8246, "step": 746800 }, { "epoch": 43.686026788325435, "grad_norm": 0.2132396399974823, "learning_rate": 0.001, "loss": 1.8271, "step": 746900 }, { "epoch": 43.69187576767854, "grad_norm": 0.16825976967811584, "learning_rate": 0.001, "loss": 1.8339, "step": 747000 }, { "epoch": 43.69772474703164, "grad_norm": 0.19866277277469635, "learning_rate": 0.001, "loss": 1.82, "step": 747100 }, { "epoch": 43.70357372638475, "grad_norm": 0.1805083155632019, "learning_rate": 0.001, "loss": 1.8324, "step": 747200 }, { "epoch": 43.70942270573785, "grad_norm": 0.22673673927783966, "learning_rate": 0.001, "loss": 1.8297, "step": 747300 }, { "epoch": 43.715271685090954, "grad_norm": 0.1429571509361267, "learning_rate": 0.001, "loss": 1.8262, "step": 747400 }, { "epoch": 43.72112066444406, "grad_norm": 0.1807010918855667, "learning_rate": 0.001, "loss": 1.8297, "step": 747500 }, { "epoch": 43.726969643797155, "grad_norm": 0.1859305500984192, "learning_rate": 0.001, "loss": 1.8218, "step": 747600 }, { "epoch": 43.73281862315026, "grad_norm": 0.17372946441173553, "learning_rate": 0.001, "loss": 1.8278, "step": 747700 }, { "epoch": 43.73866760250336, "grad_norm": 0.20066095888614655, "learning_rate": 0.001, "loss": 1.8248, "step": 747800 }, { "epoch": 43.744516581856466, "grad_norm": 0.15567894279956818, "learning_rate": 0.001, "loss": 1.822, "step": 747900 }, { "epoch": 43.75036556120957, "grad_norm": 0.1755179911851883, "learning_rate": 0.001, "loss": 1.8267, "step": 748000 }, { "epoch": 43.756214540562674, "grad_norm": 0.15276600420475006, "learning_rate": 0.001, "loss": 1.8223, "step": 748100 }, { "epoch": 43.76206351991578, "grad_norm": 0.17774082720279694, "learning_rate": 0.001, "loss": 1.824, "step": 748200 }, { "epoch": 43.767912499268874, "grad_norm": 0.18910212814807892, "learning_rate": 0.001, "loss": 1.8294, "step": 748300 }, { "epoch": 43.77376147862198, "grad_norm": 0.1741543561220169, "learning_rate": 0.001, "loss": 1.8297, "step": 748400 }, { "epoch": 43.77961045797508, "grad_norm": 0.17906413972377777, "learning_rate": 0.001, "loss": 1.8245, "step": 748500 }, { "epoch": 43.785459437328186, "grad_norm": 0.16831301152706146, "learning_rate": 0.001, "loss": 1.8271, "step": 748600 }, { "epoch": 43.79130841668129, "grad_norm": 0.14888662099838257, "learning_rate": 0.001, "loss": 1.8277, "step": 748700 }, { "epoch": 43.79715739603439, "grad_norm": 0.2300380915403366, "learning_rate": 0.001, "loss": 1.8286, "step": 748800 }, { "epoch": 43.8030063753875, "grad_norm": 0.2407395988702774, "learning_rate": 0.001, "loss": 1.8263, "step": 748900 }, { "epoch": 43.8088553547406, "grad_norm": 0.25558120012283325, "learning_rate": 0.001, "loss": 1.8302, "step": 749000 }, { "epoch": 43.8147043340937, "grad_norm": 0.19853617250919342, "learning_rate": 0.001, "loss": 1.8281, "step": 749100 }, { "epoch": 43.8205533134468, "grad_norm": 0.14848773181438446, "learning_rate": 0.001, "loss": 1.8272, "step": 749200 }, { "epoch": 43.826402292799905, "grad_norm": 0.16438697278499603, "learning_rate": 0.001, "loss": 1.8285, "step": 749300 }, { "epoch": 43.83225127215301, "grad_norm": 0.20254044234752655, "learning_rate": 0.001, "loss": 1.8256, "step": 749400 }, { "epoch": 43.83810025150611, "grad_norm": 0.21393314003944397, "learning_rate": 0.001, "loss": 1.8335, "step": 749500 }, { "epoch": 43.84394923085922, "grad_norm": 0.1690121442079544, "learning_rate": 0.001, "loss": 1.8244, "step": 749600 }, { "epoch": 43.84979821021232, "grad_norm": 0.18779407441616058, "learning_rate": 0.001, "loss": 1.8299, "step": 749700 }, { "epoch": 43.855647189565424, "grad_norm": 0.16052557528018951, "learning_rate": 0.001, "loss": 1.8242, "step": 749800 }, { "epoch": 43.86149616891852, "grad_norm": 0.1700681447982788, "learning_rate": 0.001, "loss": 1.8249, "step": 749900 }, { "epoch": 43.867345148271625, "grad_norm": 0.16964726150035858, "learning_rate": 0.001, "loss": 1.8262, "step": 750000 }, { "epoch": 43.867345148271625, "eval_ag_news_accuracy": 0.23925, "eval_ag_news_bleu_score": 7.393011892218757, "eval_ag_news_bleu_score_sem": 0.702806454435961, "eval_ag_news_emb_cos_sim": 0.7209730744361877, "eval_ag_news_emb_cos_sim_sem": 0.014049835503101349, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.688795804977417, "eval_ag_news_n_ngrams_match_1": 14.515625, "eval_ag_news_n_ngrams_match_2": 4.1953125, "eval_ag_news_n_ngrams_match_3": 1.6328125, "eval_ag_news_num_pred_words": 48.4140625, "eval_ag_news_num_true_words": 45.9375, "eval_ag_news_perplexity": 14.713946786434729, "eval_ag_news_pred_num_tokens": 70.71875, "eval_ag_news_rouge_score": 0.29858755361059625, "eval_ag_news_runtime": 37.3143, "eval_ag_news_samples_per_second": 13.4, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3310449388874862, "eval_ag_news_token_set_f1_sem": 0.00974376359432257, "eval_ag_news_token_set_precision": 0.31282053816823574, "eval_ag_news_token_set_recall": 0.360544874496934, "eval_ag_news_true_num_tokens": 63.9140625, "step": 750000 }, { "epoch": 43.867345148271625, "eval_anthropic_toxic_prompts_accuracy": 0.10359375, "eval_anthropic_toxic_prompts_bleu_score": 44.341176047596846, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.749775841013203, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8969367742538452, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008654193952679634, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03347745561491821, "eval_anthropic_toxic_prompts_loss": 1.2992478609085083, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.1328125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.15625, "eval_anthropic_toxic_prompts_num_pred_words": 14.984375, "eval_anthropic_toxic_prompts_num_true_words": 14.671875, "eval_anthropic_toxic_prompts_perplexity": 3.666537883781889, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.9453125, "eval_anthropic_toxic_prompts_rouge_score": 0.7040722803352784, "eval_anthropic_toxic_prompts_runtime": 29.3302, "eval_anthropic_toxic_prompts_samples_per_second": 17.047, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.7191697076240053, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018152390373585115, "eval_anthropic_toxic_prompts_token_set_precision": 0.7254096171750603, "eval_anthropic_toxic_prompts_token_set_recall": 0.7208957496526758, "eval_anthropic_toxic_prompts_true_num_tokens": 18.0078125, "step": 750000 }, { "epoch": 43.867345148271625, "eval_arxiv_accuracy": 0.375421875, "eval_arxiv_bleu_score": 1.7805774294568035, "eval_arxiv_bleu_score_sem": 0.13513807476031028, "eval_arxiv_emb_cos_sim": 0.5173275470733643, "eval_arxiv_emb_cos_sim_sem": 0.018507421016693115, "eval_arxiv_emb_top1_equal": 0.8828125, "eval_arxiv_emb_top1_equal_sem": 0.02854125387966633, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4443535804748535, "eval_arxiv_n_ngrams_match_1": 13.703125, "eval_arxiv_n_ngrams_match_2": 2.4453125, "eval_arxiv_n_ngrams_match_3": 0.578125, "eval_arxiv_num_pred_words": 54.796875, "eval_arxiv_num_true_words": 85.8359375, "eval_arxiv_perplexity": 31.32302908338049, "eval_arxiv_pred_num_tokens": 125.9296875, "eval_arxiv_rouge_score": 0.17361064393401007, "eval_arxiv_runtime": 29.9268, "eval_arxiv_samples_per_second": 16.707, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.17933532577603714, "eval_arxiv_token_set_f1_sem": 0.007998345923984497, "eval_arxiv_token_set_precision": 0.1217669489831502, "eval_arxiv_token_set_recall": 0.4224273405186962, "eval_arxiv_true_num_tokens": 124.3046875, "step": 750000 }, { "epoch": 43.867345148271625, "eval_python_code_alpaca_accuracy": 0.12840625, "eval_python_code_alpaca_bleu_score": 28.373042699055546, "eval_python_code_alpaca_bleu_score_sem": 1.6166257671483504, "eval_python_code_alpaca_emb_cos_sim": 0.8917979598045349, "eval_python_code_alpaca_emb_cos_sim_sem": 0.006882782094180584, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.444199800491333, "eval_python_code_alpaca_n_ngrams_match_1": 10.390625, "eval_python_code_alpaca_n_ngrams_match_2": 5.734375, "eval_python_code_alpaca_n_ngrams_match_3": 3.265625, "eval_python_code_alpaca_num_pred_words": 17.234375, "eval_python_code_alpaca_num_true_words": 18.4453125, "eval_python_code_alpaca_perplexity": 4.238459172527541, "eval_python_code_alpaca_pred_num_tokens": 23.984375, "eval_python_code_alpaca_rouge_score": 0.6114505913514262, "eval_python_code_alpaca_runtime": 29.0381, "eval_python_code_alpaca_samples_per_second": 17.219, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6245339701886797, "eval_python_code_alpaca_token_set_f1_sem": 0.014337657834239863, "eval_python_code_alpaca_token_set_precision": 0.6116085933215576, "eval_python_code_alpaca_token_set_recall": 0.6449159212933229, "eval_python_code_alpaca_true_num_tokens": 24.2734375, "step": 750000 }, { "epoch": 43.867345148271625, "eval_wikibio_accuracy": 0.368828125, "eval_wikibio_bleu_score": 7.885410634110223, "eval_wikibio_bleu_score_sem": 0.7269483870396118, "eval_wikibio_emb_cos_sim": 0.6261876821517944, "eval_wikibio_emb_cos_sim_sem": 0.02203841134905815, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7377395629882812, "eval_wikibio_n_ngrams_match_1": 15.8125, "eval_wikibio_n_ngrams_match_2": 5.4765625, "eval_wikibio_n_ngrams_match_3": 2.328125, "eval_wikibio_num_pred_words": 55.296875, "eval_wikibio_num_true_words": 52.6484375, "eval_wikibio_perplexity": 15.452017278192635, "eval_wikibio_pred_num_tokens": 107.765625, "eval_wikibio_rouge_score": 0.30853304331198017, "eval_wikibio_runtime": 30.8271, "eval_wikibio_samples_per_second": 16.22, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.33205747777501854, "eval_wikibio_token_set_f1_sem": 0.011904151875693592, "eval_wikibio_token_set_precision": 0.29915514810060023, "eval_wikibio_token_set_recall": 0.4062065902070743, "eval_wikibio_true_num_tokens": 100.7734375, "step": 750000 }, { "epoch": 43.867345148271625, "eval_msmarco_accuracy": 0.39503125, "eval_msmarco_bleu_score": 16.55499502833994, "eval_msmarco_bleu_score_sem": 1.2940271773601841, "eval_msmarco_emb_cos_sim": 0.7798468470573425, "eval_msmarco_emb_cos_sim_sem": 0.015624034218490124, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7268658876419067, "eval_msmarco_n_ngrams_match_1": 28.4921875, "eval_msmarco_n_ngrams_match_2": 12.9453125, "eval_msmarco_n_ngrams_match_3": 7.15625, "eval_msmarco_num_pred_words": 62.0859375, "eval_msmarco_num_true_words": 62.1640625, "eval_msmarco_perplexity": 5.623003139500825, "eval_msmarco_pred_num_tokens": 83.40625, "eval_msmarco_rouge_score": 0.4376199824542668, "eval_msmarco_runtime": 24.8163, "eval_msmarco_samples_per_second": 20.148, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.46900997312775183, "eval_msmarco_token_set_f1_sem": 0.012959717653281926, "eval_msmarco_token_set_precision": 0.43238512271470775, "eval_msmarco_token_set_recall": 0.5290531493916993, "eval_msmarco_true_num_tokens": 79.6875, "step": 750000 }, { "epoch": 43.87319412762473, "grad_norm": 0.13725310564041138, "learning_rate": 0.001, "loss": 1.8267, "step": 750100 }, { "epoch": 43.87904310697783, "grad_norm": 0.1791166067123413, "learning_rate": 0.001, "loss": 1.8254, "step": 750200 }, { "epoch": 43.884892086330936, "grad_norm": 0.23369546234607697, "learning_rate": 0.001, "loss": 1.8277, "step": 750300 }, { "epoch": 43.89074106568404, "grad_norm": 0.19492574036121368, "learning_rate": 0.001, "loss": 1.8281, "step": 750400 }, { "epoch": 43.896590045037144, "grad_norm": 0.19440430402755737, "learning_rate": 0.001, "loss": 1.8256, "step": 750500 }, { "epoch": 43.90243902439025, "grad_norm": 0.25372305512428284, "learning_rate": 0.001, "loss": 1.8214, "step": 750600 }, { "epoch": 43.908288003743344, "grad_norm": 0.19447314739227295, "learning_rate": 0.001, "loss": 1.8238, "step": 750700 }, { "epoch": 43.91413698309645, "grad_norm": 0.17435213923454285, "learning_rate": 0.001, "loss": 1.8275, "step": 750800 }, { "epoch": 43.91998596244955, "grad_norm": 0.21884271502494812, "learning_rate": 0.001, "loss": 1.8304, "step": 750900 }, { "epoch": 43.925834941802655, "grad_norm": 0.1965920776128769, "learning_rate": 0.001, "loss": 1.8314, "step": 751000 }, { "epoch": 43.93168392115576, "grad_norm": 0.25210514664649963, "learning_rate": 0.001, "loss": 1.8322, "step": 751100 }, { "epoch": 43.93753290050886, "grad_norm": 0.21765974164009094, "learning_rate": 0.001, "loss": 1.832, "step": 751200 }, { "epoch": 43.94338187986197, "grad_norm": 0.1785750836133957, "learning_rate": 0.001, "loss": 1.832, "step": 751300 }, { "epoch": 43.949230859215064, "grad_norm": 0.18960438668727875, "learning_rate": 0.001, "loss": 1.8303, "step": 751400 }, { "epoch": 43.95507983856817, "grad_norm": 0.17369545996189117, "learning_rate": 0.001, "loss": 1.8325, "step": 751500 }, { "epoch": 43.96092881792127, "grad_norm": 0.15593276917934418, "learning_rate": 0.001, "loss": 1.8293, "step": 751600 }, { "epoch": 43.966777797274375, "grad_norm": 0.19445590674877167, "learning_rate": 0.001, "loss": 1.8276, "step": 751700 }, { "epoch": 43.97262677662748, "grad_norm": 0.1547880321741104, "learning_rate": 0.001, "loss": 1.8253, "step": 751800 }, { "epoch": 43.97847575598058, "grad_norm": 0.18134184181690216, "learning_rate": 0.001, "loss": 1.8285, "step": 751900 }, { "epoch": 43.984324735333686, "grad_norm": 0.15195591747760773, "learning_rate": 0.001, "loss": 1.8248, "step": 752000 }, { "epoch": 43.99017371468679, "grad_norm": 0.21259821951389313, "learning_rate": 0.001, "loss": 1.8333, "step": 752100 }, { "epoch": 43.99602269403989, "grad_norm": 0.20741553604602814, "learning_rate": 0.001, "loss": 1.8306, "step": 752200 }, { "epoch": 44.00187167339299, "grad_norm": 0.23201650381088257, "learning_rate": 0.001, "loss": 1.8249, "step": 752300 }, { "epoch": 44.007720652746094, "grad_norm": 0.18808339536190033, "learning_rate": 0.001, "loss": 1.8124, "step": 752400 }, { "epoch": 44.0135696320992, "grad_norm": 0.2014758437871933, "learning_rate": 0.001, "loss": 1.8116, "step": 752500 }, { "epoch": 44.0194186114523, "grad_norm": 0.16842418909072876, "learning_rate": 0.001, "loss": 1.8153, "step": 752600 }, { "epoch": 44.025267590805406, "grad_norm": 0.1845240741968155, "learning_rate": 0.001, "loss": 1.8165, "step": 752700 }, { "epoch": 44.03111657015851, "grad_norm": 0.23852267861366272, "learning_rate": 0.001, "loss": 1.8208, "step": 752800 }, { "epoch": 44.03696554951161, "grad_norm": 0.14067277312278748, "learning_rate": 0.001, "loss": 1.8176, "step": 752900 }, { "epoch": 44.04281452886471, "grad_norm": 0.1738857626914978, "learning_rate": 0.001, "loss": 1.8188, "step": 753000 }, { "epoch": 44.048663508217814, "grad_norm": 0.1705171912908554, "learning_rate": 0.001, "loss": 1.8132, "step": 753100 }, { "epoch": 44.05451248757092, "grad_norm": 0.20421962440013885, "learning_rate": 0.001, "loss": 1.8168, "step": 753200 }, { "epoch": 44.06036146692402, "grad_norm": 0.2160249501466751, "learning_rate": 0.001, "loss": 1.8187, "step": 753300 }, { "epoch": 44.066210446277125, "grad_norm": 0.21767890453338623, "learning_rate": 0.001, "loss": 1.8209, "step": 753400 }, { "epoch": 44.07205942563023, "grad_norm": 0.2175343781709671, "learning_rate": 0.001, "loss": 1.8197, "step": 753500 }, { "epoch": 44.07790840498333, "grad_norm": 0.2183070331811905, "learning_rate": 0.001, "loss": 1.822, "step": 753600 }, { "epoch": 44.08375738433644, "grad_norm": 0.17519605159759521, "learning_rate": 0.001, "loss": 1.8231, "step": 753700 }, { "epoch": 44.08960636368953, "grad_norm": 0.23231449723243713, "learning_rate": 0.001, "loss": 1.822, "step": 753800 }, { "epoch": 44.09545534304264, "grad_norm": 0.19700732827186584, "learning_rate": 0.001, "loss": 1.8154, "step": 753900 }, { "epoch": 44.10130432239574, "grad_norm": 0.17606855928897858, "learning_rate": 0.001, "loss": 1.8253, "step": 754000 }, { "epoch": 44.107153301748845, "grad_norm": 0.17887437343597412, "learning_rate": 0.001, "loss": 1.8178, "step": 754100 }, { "epoch": 44.11300228110195, "grad_norm": 0.1532190442085266, "learning_rate": 0.001, "loss": 1.8178, "step": 754200 }, { "epoch": 44.11885126045505, "grad_norm": 0.1865747570991516, "learning_rate": 0.001, "loss": 1.8179, "step": 754300 }, { "epoch": 44.124700239808156, "grad_norm": 0.1630009412765503, "learning_rate": 0.001, "loss": 1.8145, "step": 754400 }, { "epoch": 44.13054921916125, "grad_norm": 0.18829740583896637, "learning_rate": 0.001, "loss": 1.8257, "step": 754500 }, { "epoch": 44.13639819851436, "grad_norm": 0.19097064435482025, "learning_rate": 0.001, "loss": 1.8144, "step": 754600 }, { "epoch": 44.14224717786746, "grad_norm": 0.21318866312503815, "learning_rate": 0.001, "loss": 1.8122, "step": 754700 }, { "epoch": 44.148096157220564, "grad_norm": 0.17298634350299835, "learning_rate": 0.001, "loss": 1.8205, "step": 754800 }, { "epoch": 44.15394513657367, "grad_norm": 0.1683749407529831, "learning_rate": 0.001, "loss": 1.8142, "step": 754900 }, { "epoch": 44.15979411592677, "grad_norm": 0.1826343685388565, "learning_rate": 0.001, "loss": 1.8206, "step": 755000 }, { "epoch": 44.165643095279876, "grad_norm": 0.16194243729114532, "learning_rate": 0.001, "loss": 1.822, "step": 755100 }, { "epoch": 44.17149207463298, "grad_norm": 0.2008083313703537, "learning_rate": 0.001, "loss": 1.8173, "step": 755200 }, { "epoch": 44.177341053986076, "grad_norm": 0.15479826927185059, "learning_rate": 0.001, "loss": 1.8196, "step": 755300 }, { "epoch": 44.18319003333918, "grad_norm": 0.23618009686470032, "learning_rate": 0.001, "loss": 1.8165, "step": 755400 }, { "epoch": 44.189039012692284, "grad_norm": 0.2711072564125061, "learning_rate": 0.001, "loss": 1.8161, "step": 755500 }, { "epoch": 44.19488799204539, "grad_norm": 0.16409000754356384, "learning_rate": 0.001, "loss": 1.8214, "step": 755600 }, { "epoch": 44.20073697139849, "grad_norm": 0.17087073624134064, "learning_rate": 0.001, "loss": 1.8181, "step": 755700 }, { "epoch": 44.206585950751595, "grad_norm": 0.18158367276191711, "learning_rate": 0.001, "loss": 1.8171, "step": 755800 }, { "epoch": 44.2124349301047, "grad_norm": 0.18462364375591278, "learning_rate": 0.001, "loss": 1.8114, "step": 755900 }, { "epoch": 44.2182839094578, "grad_norm": 0.16963288187980652, "learning_rate": 0.001, "loss": 1.8167, "step": 756000 }, { "epoch": 44.2241328888109, "grad_norm": 0.22101469337940216, "learning_rate": 0.001, "loss": 1.8166, "step": 756100 }, { "epoch": 44.229981868164, "grad_norm": 0.17294897139072418, "learning_rate": 0.001, "loss": 1.8205, "step": 756200 }, { "epoch": 44.23583084751711, "grad_norm": 0.17726948857307434, "learning_rate": 0.001, "loss": 1.8216, "step": 756300 }, { "epoch": 44.24167982687021, "grad_norm": 0.20428577065467834, "learning_rate": 0.001, "loss": 1.8178, "step": 756400 }, { "epoch": 44.247528806223315, "grad_norm": 0.1847604662179947, "learning_rate": 0.001, "loss": 1.8259, "step": 756500 }, { "epoch": 44.25337778557642, "grad_norm": 0.21723535656929016, "learning_rate": 0.001, "loss": 1.8187, "step": 756600 }, { "epoch": 44.25922676492952, "grad_norm": 0.23179298639297485, "learning_rate": 0.001, "loss": 1.8174, "step": 756700 }, { "epoch": 44.265075744282626, "grad_norm": 0.17005665600299835, "learning_rate": 0.001, "loss": 1.8193, "step": 756800 }, { "epoch": 44.27092472363572, "grad_norm": 0.17962636053562164, "learning_rate": 0.001, "loss": 1.8213, "step": 756900 }, { "epoch": 44.27677370298883, "grad_norm": 0.20155160129070282, "learning_rate": 0.001, "loss": 1.825, "step": 757000 }, { "epoch": 44.28262268234193, "grad_norm": 0.2031664103269577, "learning_rate": 0.001, "loss": 1.8151, "step": 757100 }, { "epoch": 44.288471661695034, "grad_norm": 0.24935925006866455, "learning_rate": 0.001, "loss": 1.8196, "step": 757200 }, { "epoch": 44.29432064104814, "grad_norm": 0.162889763712883, "learning_rate": 0.001, "loss": 1.8237, "step": 757300 }, { "epoch": 44.30016962040124, "grad_norm": 0.1851022094488144, "learning_rate": 0.001, "loss": 1.8227, "step": 757400 }, { "epoch": 44.306018599754346, "grad_norm": 0.15674246847629547, "learning_rate": 0.001, "loss": 1.8253, "step": 757500 }, { "epoch": 44.31186757910744, "grad_norm": 0.17561359703540802, "learning_rate": 0.001, "loss": 1.8196, "step": 757600 }, { "epoch": 44.317716558460546, "grad_norm": 0.1880045235157013, "learning_rate": 0.001, "loss": 1.8107, "step": 757700 }, { "epoch": 44.32356553781365, "grad_norm": 0.23461540043354034, "learning_rate": 0.001, "loss": 1.8253, "step": 757800 }, { "epoch": 44.329414517166754, "grad_norm": 0.23330195248126984, "learning_rate": 0.001, "loss": 1.821, "step": 757900 }, { "epoch": 44.33526349651986, "grad_norm": 0.17943744361400604, "learning_rate": 0.001, "loss": 1.818, "step": 758000 }, { "epoch": 44.34111247587296, "grad_norm": 0.18433058261871338, "learning_rate": 0.001, "loss": 1.8228, "step": 758100 }, { "epoch": 44.346961455226065, "grad_norm": 0.17543980479240417, "learning_rate": 0.001, "loss": 1.8228, "step": 758200 }, { "epoch": 44.35281043457917, "grad_norm": 0.19982247054576874, "learning_rate": 0.001, "loss": 1.8257, "step": 758300 }, { "epoch": 44.358659413932266, "grad_norm": 0.2259027361869812, "learning_rate": 0.001, "loss": 1.8232, "step": 758400 }, { "epoch": 44.36450839328537, "grad_norm": 0.20086951553821564, "learning_rate": 0.001, "loss": 1.8262, "step": 758500 }, { "epoch": 44.37035737263847, "grad_norm": 0.18399550020694733, "learning_rate": 0.001, "loss": 1.8257, "step": 758600 }, { "epoch": 44.37620635199158, "grad_norm": 0.22878658771514893, "learning_rate": 0.001, "loss": 1.8271, "step": 758700 }, { "epoch": 44.38205533134468, "grad_norm": 0.1817004531621933, "learning_rate": 0.001, "loss": 1.819, "step": 758800 }, { "epoch": 44.387904310697785, "grad_norm": 0.17667321860790253, "learning_rate": 0.001, "loss": 1.8224, "step": 758900 }, { "epoch": 44.39375329005089, "grad_norm": 0.16162380576133728, "learning_rate": 0.001, "loss": 1.823, "step": 759000 }, { "epoch": 44.39960226940399, "grad_norm": 0.17600946128368378, "learning_rate": 0.001, "loss": 1.8203, "step": 759100 }, { "epoch": 44.40545124875709, "grad_norm": 0.18511873483657837, "learning_rate": 0.001, "loss": 1.8206, "step": 759200 }, { "epoch": 44.41130022811019, "grad_norm": 0.1870070844888687, "learning_rate": 0.001, "loss": 1.8235, "step": 759300 }, { "epoch": 44.4171492074633, "grad_norm": 0.22426548600196838, "learning_rate": 0.001, "loss": 1.821, "step": 759400 }, { "epoch": 44.4229981868164, "grad_norm": 0.26387834548950195, "learning_rate": 0.001, "loss": 1.8192, "step": 759500 }, { "epoch": 44.428847166169504, "grad_norm": 0.1687518209218979, "learning_rate": 0.001, "loss": 1.8246, "step": 759600 }, { "epoch": 44.43469614552261, "grad_norm": 0.1723814755678177, "learning_rate": 0.001, "loss": 1.8209, "step": 759700 }, { "epoch": 44.44054512487571, "grad_norm": 0.18719853460788727, "learning_rate": 0.001, "loss": 1.8244, "step": 759800 }, { "epoch": 44.446394104228816, "grad_norm": 0.2131827026605606, "learning_rate": 0.001, "loss": 1.8233, "step": 759900 }, { "epoch": 44.45224308358191, "grad_norm": 0.19894060492515564, "learning_rate": 0.001, "loss": 1.8215, "step": 760000 }, { "epoch": 44.458092062935016, "grad_norm": 0.21758611500263214, "learning_rate": 0.001, "loss": 1.8208, "step": 760100 }, { "epoch": 44.46394104228812, "grad_norm": 0.16916847229003906, "learning_rate": 0.001, "loss": 1.8263, "step": 760200 }, { "epoch": 44.469790021641224, "grad_norm": 0.18590858578681946, "learning_rate": 0.001, "loss": 1.8258, "step": 760300 }, { "epoch": 44.47563900099433, "grad_norm": 0.18971721827983856, "learning_rate": 0.001, "loss": 1.8249, "step": 760400 }, { "epoch": 44.48148798034743, "grad_norm": 0.19328226149082184, "learning_rate": 0.001, "loss": 1.825, "step": 760500 }, { "epoch": 44.487336959700535, "grad_norm": 0.17494845390319824, "learning_rate": 0.001, "loss": 1.8283, "step": 760600 }, { "epoch": 44.49318593905363, "grad_norm": 0.19306756556034088, "learning_rate": 0.001, "loss": 1.8279, "step": 760700 }, { "epoch": 44.499034918406736, "grad_norm": 0.16019664704799652, "learning_rate": 0.001, "loss": 1.8239, "step": 760800 }, { "epoch": 44.50488389775984, "grad_norm": 0.2247260957956314, "learning_rate": 0.001, "loss": 1.8169, "step": 760900 }, { "epoch": 44.51073287711294, "grad_norm": 0.19855444133281708, "learning_rate": 0.001, "loss": 1.8206, "step": 761000 }, { "epoch": 44.51658185646605, "grad_norm": 0.1819285750389099, "learning_rate": 0.001, "loss": 1.8243, "step": 761100 }, { "epoch": 44.52243083581915, "grad_norm": 0.17268820106983185, "learning_rate": 0.001, "loss": 1.8194, "step": 761200 }, { "epoch": 44.528279815172255, "grad_norm": 0.15423324704170227, "learning_rate": 0.001, "loss": 1.8253, "step": 761300 }, { "epoch": 44.53412879452536, "grad_norm": 0.24624913930892944, "learning_rate": 0.001, "loss": 1.817, "step": 761400 }, { "epoch": 44.539977773878455, "grad_norm": 0.23852401971817017, "learning_rate": 0.001, "loss": 1.8225, "step": 761500 }, { "epoch": 44.54582675323156, "grad_norm": 0.16740167140960693, "learning_rate": 0.001, "loss": 1.8224, "step": 761600 }, { "epoch": 44.55167573258466, "grad_norm": 0.19346752762794495, "learning_rate": 0.001, "loss": 1.8295, "step": 761700 }, { "epoch": 44.557524711937766, "grad_norm": 0.16540133953094482, "learning_rate": 0.001, "loss": 1.8269, "step": 761800 }, { "epoch": 44.56337369129087, "grad_norm": 0.21368174254894257, "learning_rate": 0.001, "loss": 1.8273, "step": 761900 }, { "epoch": 44.569222670643974, "grad_norm": 0.15762996673583984, "learning_rate": 0.001, "loss": 1.827, "step": 762000 }, { "epoch": 44.57507164999708, "grad_norm": 0.1689900904893875, "learning_rate": 0.001, "loss": 1.8224, "step": 762100 }, { "epoch": 44.58092062935018, "grad_norm": 0.17701628804206848, "learning_rate": 0.001, "loss": 1.8191, "step": 762200 }, { "epoch": 44.58676960870328, "grad_norm": 0.18869267404079437, "learning_rate": 0.001, "loss": 1.8198, "step": 762300 }, { "epoch": 44.59261858805638, "grad_norm": 0.17952744662761688, "learning_rate": 0.001, "loss": 1.8274, "step": 762400 }, { "epoch": 44.598467567409486, "grad_norm": 0.17749570310115814, "learning_rate": 0.001, "loss": 1.8298, "step": 762500 }, { "epoch": 44.60431654676259, "grad_norm": 0.25608471035957336, "learning_rate": 0.001, "loss": 1.8275, "step": 762600 }, { "epoch": 44.61016552611569, "grad_norm": 0.16992658376693726, "learning_rate": 0.001, "loss": 1.8179, "step": 762700 }, { "epoch": 44.6160145054688, "grad_norm": 0.16260620951652527, "learning_rate": 0.001, "loss": 1.8261, "step": 762800 }, { "epoch": 44.6218634848219, "grad_norm": 0.15503759682178497, "learning_rate": 0.001, "loss": 1.8235, "step": 762900 }, { "epoch": 44.627712464175005, "grad_norm": 0.14284494519233704, "learning_rate": 0.001, "loss": 1.8258, "step": 763000 }, { "epoch": 44.6335614435281, "grad_norm": 0.2237863689661026, "learning_rate": 0.001, "loss": 1.8273, "step": 763100 }, { "epoch": 44.639410422881205, "grad_norm": 0.1763356626033783, "learning_rate": 0.001, "loss": 1.8223, "step": 763200 }, { "epoch": 44.64525940223431, "grad_norm": 0.1937875598669052, "learning_rate": 0.001, "loss": 1.8253, "step": 763300 }, { "epoch": 44.65110838158741, "grad_norm": 0.3114631474018097, "learning_rate": 0.001, "loss": 1.8266, "step": 763400 }, { "epoch": 44.65695736094052, "grad_norm": 0.18136310577392578, "learning_rate": 0.001, "loss": 1.8274, "step": 763500 }, { "epoch": 44.66280634029362, "grad_norm": 0.17481710016727448, "learning_rate": 0.001, "loss": 1.8263, "step": 763600 }, { "epoch": 44.668655319646724, "grad_norm": 0.18725408613681793, "learning_rate": 0.001, "loss": 1.8239, "step": 763700 }, { "epoch": 44.67450429899982, "grad_norm": 0.17296436429023743, "learning_rate": 0.001, "loss": 1.8237, "step": 763800 }, { "epoch": 44.680353278352925, "grad_norm": 0.16611428558826447, "learning_rate": 0.001, "loss": 1.8166, "step": 763900 }, { "epoch": 44.68620225770603, "grad_norm": 0.1567995548248291, "learning_rate": 0.001, "loss": 1.8224, "step": 764000 }, { "epoch": 44.69205123705913, "grad_norm": 0.2398023158311844, "learning_rate": 0.001, "loss": 1.8252, "step": 764100 }, { "epoch": 44.697900216412236, "grad_norm": 0.23560409247875214, "learning_rate": 0.001, "loss": 1.8289, "step": 764200 }, { "epoch": 44.70374919576534, "grad_norm": 0.18050701916217804, "learning_rate": 0.001, "loss": 1.8222, "step": 764300 }, { "epoch": 44.709598175118444, "grad_norm": 0.26023146510124207, "learning_rate": 0.001, "loss": 1.8245, "step": 764400 }, { "epoch": 44.71544715447155, "grad_norm": 0.19690194725990295, "learning_rate": 0.001, "loss": 1.8285, "step": 764500 }, { "epoch": 44.721296133824644, "grad_norm": 0.16116560995578766, "learning_rate": 0.001, "loss": 1.8187, "step": 764600 }, { "epoch": 44.72714511317775, "grad_norm": 0.1431407630443573, "learning_rate": 0.001, "loss": 1.8251, "step": 764700 }, { "epoch": 44.73299409253085, "grad_norm": 0.16444581747055054, "learning_rate": 0.001, "loss": 1.8211, "step": 764800 }, { "epoch": 44.738843071883956, "grad_norm": 0.1775766909122467, "learning_rate": 0.001, "loss": 1.8356, "step": 764900 }, { "epoch": 44.74469205123706, "grad_norm": 0.1617027223110199, "learning_rate": 0.001, "loss": 1.8258, "step": 765000 }, { "epoch": 44.75054103059016, "grad_norm": 0.19039610028266907, "learning_rate": 0.001, "loss": 1.8243, "step": 765100 }, { "epoch": 44.75639000994327, "grad_norm": 0.16911043226718903, "learning_rate": 0.001, "loss": 1.8293, "step": 765200 }, { "epoch": 44.76223898929637, "grad_norm": 0.19672556221485138, "learning_rate": 0.001, "loss": 1.8235, "step": 765300 }, { "epoch": 44.76808796864947, "grad_norm": 0.17955723404884338, "learning_rate": 0.001, "loss": 1.8312, "step": 765400 }, { "epoch": 44.77393694800257, "grad_norm": 0.17735791206359863, "learning_rate": 0.001, "loss": 1.8266, "step": 765500 }, { "epoch": 44.779785927355675, "grad_norm": 0.21774128079414368, "learning_rate": 0.001, "loss": 1.8259, "step": 765600 }, { "epoch": 44.78563490670878, "grad_norm": 0.16394931077957153, "learning_rate": 0.001, "loss": 1.8229, "step": 765700 }, { "epoch": 44.79148388606188, "grad_norm": 0.1535056084394455, "learning_rate": 0.001, "loss": 1.8188, "step": 765800 }, { "epoch": 44.79733286541499, "grad_norm": 0.15904280543327332, "learning_rate": 0.001, "loss": 1.8245, "step": 765900 }, { "epoch": 44.80318184476809, "grad_norm": 0.22922609746456146, "learning_rate": 0.001, "loss": 1.8248, "step": 766000 }, { "epoch": 44.809030824121194, "grad_norm": 0.2695280909538269, "learning_rate": 0.001, "loss": 1.821, "step": 766100 }, { "epoch": 44.81487980347429, "grad_norm": 0.1756490021944046, "learning_rate": 0.001, "loss": 1.8273, "step": 766200 }, { "epoch": 44.820728782827395, "grad_norm": 0.1906689703464508, "learning_rate": 0.001, "loss": 1.8226, "step": 766300 }, { "epoch": 44.8265777621805, "grad_norm": 0.2247869074344635, "learning_rate": 0.001, "loss": 1.8296, "step": 766400 }, { "epoch": 44.8324267415336, "grad_norm": 0.1961369812488556, "learning_rate": 0.001, "loss": 1.8267, "step": 766500 }, { "epoch": 44.838275720886706, "grad_norm": 0.20978538691997528, "learning_rate": 0.001, "loss": 1.8275, "step": 766600 }, { "epoch": 44.84412470023981, "grad_norm": 0.21208782494068146, "learning_rate": 0.001, "loss": 1.825, "step": 766700 }, { "epoch": 44.849973679592914, "grad_norm": 0.1549428403377533, "learning_rate": 0.001, "loss": 1.8227, "step": 766800 }, { "epoch": 44.85582265894601, "grad_norm": 0.18137618899345398, "learning_rate": 0.001, "loss": 1.8243, "step": 766900 }, { "epoch": 44.861671638299114, "grad_norm": 0.1688028872013092, "learning_rate": 0.001, "loss": 1.8216, "step": 767000 }, { "epoch": 44.86752061765222, "grad_norm": 0.18220138549804688, "learning_rate": 0.001, "loss": 1.8273, "step": 767100 }, { "epoch": 44.87336959700532, "grad_norm": 0.21260209381580353, "learning_rate": 0.001, "loss": 1.826, "step": 767200 }, { "epoch": 44.879218576358426, "grad_norm": 0.23042362928390503, "learning_rate": 0.001, "loss": 1.8303, "step": 767300 }, { "epoch": 44.88506755571153, "grad_norm": 0.17619256675243378, "learning_rate": 0.001, "loss": 1.8269, "step": 767400 }, { "epoch": 44.89091653506463, "grad_norm": 0.1989208459854126, "learning_rate": 0.001, "loss": 1.8252, "step": 767500 }, { "epoch": 44.89676551441774, "grad_norm": 0.18088240921497345, "learning_rate": 0.001, "loss": 1.825, "step": 767600 }, { "epoch": 44.902614493770834, "grad_norm": 0.20878556370735168, "learning_rate": 0.001, "loss": 1.8206, "step": 767700 }, { "epoch": 44.90846347312394, "grad_norm": 0.19941361248493195, "learning_rate": 0.001, "loss": 1.8259, "step": 767800 }, { "epoch": 44.91431245247704, "grad_norm": 0.1916632205247879, "learning_rate": 0.001, "loss": 1.8289, "step": 767900 }, { "epoch": 44.920161431830145, "grad_norm": 0.2125074863433838, "learning_rate": 0.001, "loss": 1.8276, "step": 768000 }, { "epoch": 44.92601041118325, "grad_norm": 0.18522077798843384, "learning_rate": 0.001, "loss": 1.8229, "step": 768100 }, { "epoch": 44.93185939053635, "grad_norm": 0.17296762764453888, "learning_rate": 0.001, "loss": 1.8245, "step": 768200 }, { "epoch": 44.93770836988946, "grad_norm": 0.15053997933864594, "learning_rate": 0.001, "loss": 1.8272, "step": 768300 }, { "epoch": 44.94355734924256, "grad_norm": 0.18884825706481934, "learning_rate": 0.001, "loss": 1.8237, "step": 768400 }, { "epoch": 44.94940632859566, "grad_norm": 0.1724419891834259, "learning_rate": 0.001, "loss": 1.8226, "step": 768500 }, { "epoch": 44.95525530794876, "grad_norm": 0.17818042635917664, "learning_rate": 0.001, "loss": 1.8243, "step": 768600 }, { "epoch": 44.961104287301865, "grad_norm": 0.15003922581672668, "learning_rate": 0.001, "loss": 1.8251, "step": 768700 }, { "epoch": 44.96695326665497, "grad_norm": 0.2699049115180969, "learning_rate": 0.001, "loss": 1.832, "step": 768800 }, { "epoch": 44.97280224600807, "grad_norm": 0.18719761073589325, "learning_rate": 0.001, "loss": 1.828, "step": 768900 }, { "epoch": 44.978651225361176, "grad_norm": 0.18021784722805023, "learning_rate": 0.001, "loss": 1.8238, "step": 769000 }, { "epoch": 44.98450020471428, "grad_norm": 0.2097311019897461, "learning_rate": 0.001, "loss": 1.8301, "step": 769100 }, { "epoch": 44.990349184067384, "grad_norm": 0.16285483539104462, "learning_rate": 0.001, "loss": 1.8234, "step": 769200 }, { "epoch": 44.99619816342048, "grad_norm": 0.16785961389541626, "learning_rate": 0.001, "loss": 1.826, "step": 769300 }, { "epoch": 45.002047142773584, "grad_norm": 0.18388986587524414, "learning_rate": 0.001, "loss": 1.8265, "step": 769400 }, { "epoch": 45.00789612212669, "grad_norm": 0.15771105885505676, "learning_rate": 0.001, "loss": 1.8131, "step": 769500 }, { "epoch": 45.01374510147979, "grad_norm": 0.2174263894557953, "learning_rate": 0.001, "loss": 1.8101, "step": 769600 }, { "epoch": 45.019594080832896, "grad_norm": 0.16572339832782745, "learning_rate": 0.001, "loss": 1.8095, "step": 769700 }, { "epoch": 45.025443060186, "grad_norm": 0.19700290262699127, "learning_rate": 0.001, "loss": 1.8122, "step": 769800 }, { "epoch": 45.0312920395391, "grad_norm": 0.23886844515800476, "learning_rate": 0.001, "loss": 1.8135, "step": 769900 }, { "epoch": 45.0371410188922, "grad_norm": 0.192087784409523, "learning_rate": 0.001, "loss": 1.8145, "step": 770000 }, { "epoch": 45.042989998245304, "grad_norm": 0.16002006828784943, "learning_rate": 0.001, "loss": 1.8117, "step": 770100 }, { "epoch": 45.04883897759841, "grad_norm": 0.15867425501346588, "learning_rate": 0.001, "loss": 1.8108, "step": 770200 }, { "epoch": 45.05468795695151, "grad_norm": 0.171270951628685, "learning_rate": 0.001, "loss": 1.8078, "step": 770300 }, { "epoch": 45.060536936304615, "grad_norm": 0.16910795867443085, "learning_rate": 0.001, "loss": 1.8164, "step": 770400 }, { "epoch": 45.06638591565772, "grad_norm": 0.2132987678050995, "learning_rate": 0.001, "loss": 1.8141, "step": 770500 }, { "epoch": 45.07223489501082, "grad_norm": 0.18500813841819763, "learning_rate": 0.001, "loss": 1.8126, "step": 770600 }, { "epoch": 45.07808387436393, "grad_norm": 0.20977507531642914, "learning_rate": 0.001, "loss": 1.8158, "step": 770700 }, { "epoch": 45.08393285371702, "grad_norm": 0.22608768939971924, "learning_rate": 0.001, "loss": 1.8103, "step": 770800 }, { "epoch": 45.08978183307013, "grad_norm": 0.1766618937253952, "learning_rate": 0.001, "loss": 1.8123, "step": 770900 }, { "epoch": 45.09563081242323, "grad_norm": 0.219323992729187, "learning_rate": 0.001, "loss": 1.813, "step": 771000 }, { "epoch": 45.101479791776335, "grad_norm": 0.16660349071025848, "learning_rate": 0.001, "loss": 1.8106, "step": 771100 }, { "epoch": 45.10732877112944, "grad_norm": 0.1859140843153, "learning_rate": 0.001, "loss": 1.8188, "step": 771200 }, { "epoch": 45.11317775048254, "grad_norm": 0.23595157265663147, "learning_rate": 0.001, "loss": 1.8187, "step": 771300 }, { "epoch": 45.119026729835646, "grad_norm": 0.250771701335907, "learning_rate": 0.001, "loss": 1.8178, "step": 771400 }, { "epoch": 45.12487570918875, "grad_norm": 0.20442546904087067, "learning_rate": 0.001, "loss": 1.8113, "step": 771500 }, { "epoch": 45.13072468854185, "grad_norm": 0.2374405562877655, "learning_rate": 0.001, "loss": 1.8137, "step": 771600 }, { "epoch": 45.13657366789495, "grad_norm": 0.16556675732135773, "learning_rate": 0.001, "loss": 1.813, "step": 771700 }, { "epoch": 45.142422647248054, "grad_norm": 0.17417730391025543, "learning_rate": 0.001, "loss": 1.8182, "step": 771800 }, { "epoch": 45.14827162660116, "grad_norm": 0.17915740609169006, "learning_rate": 0.001, "loss": 1.8172, "step": 771900 }, { "epoch": 45.15412060595426, "grad_norm": 0.20417337119579315, "learning_rate": 0.001, "loss": 1.8158, "step": 772000 }, { "epoch": 45.159969585307365, "grad_norm": 0.35151857137680054, "learning_rate": 0.001, "loss": 1.814, "step": 772100 }, { "epoch": 45.16581856466047, "grad_norm": 0.21016328036785126, "learning_rate": 0.001, "loss": 1.8177, "step": 772200 }, { "epoch": 45.17166754401357, "grad_norm": 0.24383334815502167, "learning_rate": 0.001, "loss": 1.8098, "step": 772300 }, { "epoch": 45.17751652336667, "grad_norm": 0.24441437423229218, "learning_rate": 0.001, "loss": 1.8213, "step": 772400 }, { "epoch": 45.183365502719774, "grad_norm": 0.15312018990516663, "learning_rate": 0.001, "loss": 1.8199, "step": 772500 }, { "epoch": 45.18921448207288, "grad_norm": 0.20344127714633942, "learning_rate": 0.001, "loss": 1.8187, "step": 772600 }, { "epoch": 45.19506346142598, "grad_norm": 0.24644076824188232, "learning_rate": 0.001, "loss": 1.8173, "step": 772700 }, { "epoch": 45.200912440779085, "grad_norm": 0.18453119695186615, "learning_rate": 0.001, "loss": 1.8159, "step": 772800 }, { "epoch": 45.20676142013219, "grad_norm": 0.23413749039173126, "learning_rate": 0.001, "loss": 1.8225, "step": 772900 }, { "epoch": 45.21261039948529, "grad_norm": 0.18369372189044952, "learning_rate": 0.001, "loss": 1.8195, "step": 773000 }, { "epoch": 45.21845937883839, "grad_norm": 0.20539267361164093, "learning_rate": 0.001, "loss": 1.8198, "step": 773100 }, { "epoch": 45.22430835819149, "grad_norm": 0.21234725415706635, "learning_rate": 0.001, "loss": 1.8131, "step": 773200 }, { "epoch": 45.2301573375446, "grad_norm": 0.19456328451633453, "learning_rate": 0.001, "loss": 1.818, "step": 773300 }, { "epoch": 45.2360063168977, "grad_norm": 0.16152669489383698, "learning_rate": 0.001, "loss": 1.8165, "step": 773400 }, { "epoch": 45.241855296250804, "grad_norm": 0.22072310745716095, "learning_rate": 0.001, "loss": 1.8148, "step": 773500 }, { "epoch": 45.24770427560391, "grad_norm": 0.19803239405155182, "learning_rate": 0.001, "loss": 1.821, "step": 773600 }, { "epoch": 45.25355325495701, "grad_norm": 0.24069933593273163, "learning_rate": 0.001, "loss": 1.8236, "step": 773700 }, { "epoch": 45.259402234310116, "grad_norm": 0.21049585938453674, "learning_rate": 0.001, "loss": 1.8193, "step": 773800 }, { "epoch": 45.26525121366321, "grad_norm": 0.22669199109077454, "learning_rate": 0.001, "loss": 1.8174, "step": 773900 }, { "epoch": 45.271100193016316, "grad_norm": 0.19610676169395447, "learning_rate": 0.001, "loss": 1.8161, "step": 774000 }, { "epoch": 45.27694917236942, "grad_norm": 0.1734716296195984, "learning_rate": 0.001, "loss": 1.8137, "step": 774100 }, { "epoch": 45.282798151722524, "grad_norm": 0.21717773377895355, "learning_rate": 0.001, "loss": 1.8163, "step": 774200 }, { "epoch": 45.28864713107563, "grad_norm": 0.19892527163028717, "learning_rate": 0.001, "loss": 1.817, "step": 774300 }, { "epoch": 45.29449611042873, "grad_norm": 0.1810588389635086, "learning_rate": 0.001, "loss": 1.8183, "step": 774400 }, { "epoch": 45.300345089781835, "grad_norm": 0.20801076292991638, "learning_rate": 0.001, "loss": 1.8188, "step": 774500 }, { "epoch": 45.30619406913494, "grad_norm": 0.2017236351966858, "learning_rate": 0.001, "loss": 1.819, "step": 774600 }, { "epoch": 45.312043048488036, "grad_norm": 0.24974334239959717, "learning_rate": 0.001, "loss": 1.8217, "step": 774700 }, { "epoch": 45.31789202784114, "grad_norm": 0.23418499529361725, "learning_rate": 0.001, "loss": 1.8244, "step": 774800 }, { "epoch": 45.32374100719424, "grad_norm": 0.18484684824943542, "learning_rate": 0.001, "loss": 1.8234, "step": 774900 }, { "epoch": 45.32958998654735, "grad_norm": 0.19107134640216827, "learning_rate": 0.001, "loss": 1.8158, "step": 775000 }, { "epoch": 45.32958998654735, "eval_ag_news_accuracy": 0.2413125, "eval_ag_news_bleu_score": 7.577851768066386, "eval_ag_news_bleu_score_sem": 0.5760416771167922, "eval_ag_news_emb_cos_sim": 0.729026734828949, "eval_ag_news_emb_cos_sim_sem": 0.013563760556280613, "eval_ag_news_emb_top1_equal": 0.9765625, "eval_ag_news_emb_top1_equal_sem": 0.013424675911664963, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6921563148498535, "eval_ag_news_n_ngrams_match_1": 14.1640625, "eval_ag_news_n_ngrams_match_2": 4.0859375, "eval_ag_news_n_ngrams_match_3": 1.65625, "eval_ag_news_num_pred_words": 45.859375, "eval_ag_news_num_true_words": 44.03125, "eval_ag_news_perplexity": 14.76347632551405, "eval_ag_news_pred_num_tokens": 66.4609375, "eval_ag_news_rouge_score": 0.3080537016234982, "eval_ag_news_runtime": 39.2646, "eval_ag_news_samples_per_second": 12.734, "eval_ag_news_steps_per_second": 0.025, "eval_ag_news_token_set_f1": 0.347836728616714, "eval_ag_news_token_set_f1_sem": 0.010234294219313354, "eval_ag_news_token_set_precision": 0.3328379454676147, "eval_ag_news_token_set_recall": 0.36935480292895095, "eval_ag_news_true_num_tokens": 60.5625, "step": 775000 }, { "epoch": 45.32958998654735, "eval_anthropic_toxic_prompts_accuracy": 0.10275, "eval_anthropic_toxic_prompts_bleu_score": 39.19196508680865, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.369517328894437, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8907265663146973, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008418033830821514, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.09375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.025864720370468334, "eval_anthropic_toxic_prompts_loss": 1.2645035982131958, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.9765625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.4453125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.46875, "eval_anthropic_toxic_prompts_num_pred_words": 14.828125, "eval_anthropic_toxic_prompts_num_true_words": 14.34375, "eval_anthropic_toxic_prompts_perplexity": 3.541334375006868, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.8515625, "eval_anthropic_toxic_prompts_rouge_score": 0.6726788234124881, "eval_anthropic_toxic_prompts_runtime": 30.9478, "eval_anthropic_toxic_prompts_samples_per_second": 16.156, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.6912605022418963, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017472680307536737, "eval_anthropic_toxic_prompts_token_set_precision": 0.705990912065403, "eval_anthropic_toxic_prompts_token_set_recall": 0.6881671204467533, "eval_anthropic_toxic_prompts_true_num_tokens": 17.53125, "step": 775000 }, { "epoch": 45.32958998654735, "eval_arxiv_accuracy": 0.37578125, "eval_arxiv_bleu_score": 1.6896162443052472, "eval_arxiv_bleu_score_sem": 0.1632735696520221, "eval_arxiv_emb_cos_sim": 0.4289698600769043, "eval_arxiv_emb_cos_sim_sem": 0.017900411039590836, "eval_arxiv_emb_top1_equal": 0.875, "eval_arxiv_emb_top1_equal_sem": 0.029346559196710587, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4440643787384033, "eval_arxiv_n_ngrams_match_1": 13.5234375, "eval_arxiv_n_ngrams_match_2": 2.203125, "eval_arxiv_n_ngrams_match_3": 0.4453125, "eval_arxiv_num_pred_words": 56.671875, "eval_arxiv_num_true_words": 86.5, "eval_arxiv_perplexity": 31.313971718744614, "eval_arxiv_pred_num_tokens": 126.15625, "eval_arxiv_rouge_score": 0.17384867188401001, "eval_arxiv_runtime": 32.9012, "eval_arxiv_samples_per_second": 15.197, "eval_arxiv_steps_per_second": 0.03, "eval_arxiv_token_set_f1": 0.16826520690732913, "eval_arxiv_token_set_f1_sem": 0.008060274393709797, "eval_arxiv_token_set_precision": 0.11544367038490835, "eval_arxiv_token_set_recall": 0.40269136412548406, "eval_arxiv_true_num_tokens": 124.9375, "step": 775000 }, { "epoch": 45.32958998654735, "eval_python_code_alpaca_accuracy": 0.129703125, "eval_python_code_alpaca_bleu_score": 26.56160066660749, "eval_python_code_alpaca_bleu_score_sem": 1.6351584351662465, "eval_python_code_alpaca_emb_cos_sim": 0.864230215549469, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009466725401580334, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4603147506713867, "eval_python_code_alpaca_n_ngrams_match_1": 10.375, "eval_python_code_alpaca_n_ngrams_match_2": 5.4765625, "eval_python_code_alpaca_n_ngrams_match_3": 3.109375, "eval_python_code_alpaca_num_pred_words": 18.3515625, "eval_python_code_alpaca_num_true_words": 18.671875, "eval_python_code_alpaca_perplexity": 4.307315045311464, "eval_python_code_alpaca_pred_num_tokens": 24.203125, "eval_python_code_alpaca_rouge_score": 0.5814384670880415, "eval_python_code_alpaca_runtime": 32.4418, "eval_python_code_alpaca_samples_per_second": 15.412, "eval_python_code_alpaca_steps_per_second": 0.031, "eval_python_code_alpaca_token_set_f1": 0.6007765132728617, "eval_python_code_alpaca_token_set_f1_sem": 0.013643427991176007, "eval_python_code_alpaca_token_set_precision": 0.5937191958625798, "eval_python_code_alpaca_token_set_recall": 0.6128237017105018, "eval_python_code_alpaca_true_num_tokens": 24.359375, "step": 775000 }, { "epoch": 45.32958998654735, "eval_wikibio_accuracy": 0.368796875, "eval_wikibio_bleu_score": 9.5905207071442, "eval_wikibio_bleu_score_sem": 0.9039383516329843, "eval_wikibio_emb_cos_sim": 0.6485955119132996, "eval_wikibio_emb_cos_sim_sem": 0.022157175466418266, "eval_wikibio_emb_top1_equal": 0.9453125, "eval_wikibio_emb_top1_equal_sem": 0.020175758749246597, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7021443843841553, "eval_wikibio_n_ngrams_match_1": 15.6796875, "eval_wikibio_n_ngrams_match_2": 5.7265625, "eval_wikibio_n_ngrams_match_3": 2.5546875, "eval_wikibio_num_pred_words": 49.734375, "eval_wikibio_num_true_words": 48.5234375, "eval_wikibio_perplexity": 14.911673825054313, "eval_wikibio_pred_num_tokens": 97.7265625, "eval_wikibio_rouge_score": 0.3318121355348751, "eval_wikibio_runtime": 32.3169, "eval_wikibio_samples_per_second": 15.472, "eval_wikibio_steps_per_second": 0.031, "eval_wikibio_token_set_f1": 0.34878426092040116, "eval_wikibio_token_set_f1_sem": 0.012424428448786547, "eval_wikibio_token_set_precision": 0.3158355192469382, "eval_wikibio_token_set_recall": 0.4237240979797543, "eval_wikibio_true_num_tokens": 92.546875, "step": 775000 }, { "epoch": 45.32958998654735, "eval_msmarco_accuracy": 0.400515625, "eval_msmarco_bleu_score": 16.062755322757827, "eval_msmarco_bleu_score_sem": 1.3909088423555334, "eval_msmarco_emb_cos_sim": 0.7697697877883911, "eval_msmarco_emb_cos_sim_sem": 0.017976997420191765, "eval_msmarco_emb_top1_equal": 0.9375, "eval_msmarco_emb_top1_equal_sem": 0.02147948183119297, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7331979274749756, "eval_msmarco_n_ngrams_match_1": 28.859375, "eval_msmarco_n_ngrams_match_2": 12.828125, "eval_msmarco_n_ngrams_match_3": 7.140625, "eval_msmarco_num_pred_words": 66.7265625, "eval_msmarco_num_true_words": 64.5390625, "eval_msmarco_perplexity": 5.658721184060041, "eval_msmarco_pred_num_tokens": 90.6171875, "eval_msmarco_rouge_score": 0.4245174129424166, "eval_msmarco_runtime": 26.0588, "eval_msmarco_samples_per_second": 19.187, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.4549954656043991, "eval_msmarco_token_set_f1_sem": 0.014129992365951496, "eval_msmarco_token_set_precision": 0.4178175764852007, "eval_msmarco_token_set_recall": 0.5346404293968642, "eval_msmarco_true_num_tokens": 84.34375, "step": 775000 }, { "epoch": 45.33543896590045, "grad_norm": 0.22917936742305756, "learning_rate": 0.001, "loss": 1.8193, "step": 775100 }, { "epoch": 45.341287945253555, "grad_norm": 0.1797250211238861, "learning_rate": 0.001, "loss": 1.8216, "step": 775200 }, { "epoch": 45.34713692460666, "grad_norm": 0.2916063666343689, "learning_rate": 0.001, "loss": 1.8199, "step": 775300 }, { "epoch": 45.35298590395976, "grad_norm": 0.17425021529197693, "learning_rate": 0.001, "loss": 1.8149, "step": 775400 }, { "epoch": 45.35883488331286, "grad_norm": 0.17002175748348236, "learning_rate": 0.001, "loss": 1.8161, "step": 775500 }, { "epoch": 45.36468386266596, "grad_norm": 0.1985417753458023, "learning_rate": 0.001, "loss": 1.819, "step": 775600 }, { "epoch": 45.37053284201907, "grad_norm": 0.17142029106616974, "learning_rate": 0.001, "loss": 1.8146, "step": 775700 }, { "epoch": 45.37638182137217, "grad_norm": 0.17045195400714874, "learning_rate": 0.001, "loss": 1.8234, "step": 775800 }, { "epoch": 45.382230800725274, "grad_norm": 0.32402509450912476, "learning_rate": 0.001, "loss": 1.8141, "step": 775900 }, { "epoch": 45.38807978007838, "grad_norm": 0.2562015652656555, "learning_rate": 0.001, "loss": 1.8195, "step": 776000 }, { "epoch": 45.39392875943148, "grad_norm": 0.1943957805633545, "learning_rate": 0.001, "loss": 1.8264, "step": 776100 }, { "epoch": 45.39977773878458, "grad_norm": 0.17245802283287048, "learning_rate": 0.001, "loss": 1.8212, "step": 776200 }, { "epoch": 45.40562671813768, "grad_norm": 0.22363057732582092, "learning_rate": 0.001, "loss": 1.8185, "step": 776300 }, { "epoch": 45.411475697490786, "grad_norm": 0.15849538147449493, "learning_rate": 0.001, "loss": 1.8187, "step": 776400 }, { "epoch": 45.41732467684389, "grad_norm": 0.1999543458223343, "learning_rate": 0.001, "loss": 1.8211, "step": 776500 }, { "epoch": 45.423173656196994, "grad_norm": 0.2284635603427887, "learning_rate": 0.001, "loss": 1.8155, "step": 776600 }, { "epoch": 45.4290226355501, "grad_norm": 0.18357022106647491, "learning_rate": 0.001, "loss": 1.8204, "step": 776700 }, { "epoch": 45.4348716149032, "grad_norm": 0.2138507217168808, "learning_rate": 0.001, "loss": 1.8185, "step": 776800 }, { "epoch": 45.440720594256305, "grad_norm": 0.22328947484493256, "learning_rate": 0.001, "loss": 1.8203, "step": 776900 }, { "epoch": 45.4465695736094, "grad_norm": 0.2091543823480606, "learning_rate": 0.001, "loss": 1.8214, "step": 777000 }, { "epoch": 45.452418552962506, "grad_norm": 0.1900198757648468, "learning_rate": 0.001, "loss": 1.8225, "step": 777100 }, { "epoch": 45.45826753231561, "grad_norm": 0.18333293497562408, "learning_rate": 0.001, "loss": 1.8218, "step": 777200 }, { "epoch": 45.46411651166871, "grad_norm": 0.20378118753433228, "learning_rate": 0.001, "loss": 1.823, "step": 777300 }, { "epoch": 45.46996549102182, "grad_norm": 0.17818360030651093, "learning_rate": 0.001, "loss": 1.8094, "step": 777400 }, { "epoch": 45.47581447037492, "grad_norm": 0.22152413427829742, "learning_rate": 0.001, "loss": 1.8184, "step": 777500 }, { "epoch": 45.481663449728025, "grad_norm": 0.2045465111732483, "learning_rate": 0.001, "loss": 1.811, "step": 777600 }, { "epoch": 45.48751242908113, "grad_norm": 0.18012288212776184, "learning_rate": 0.001, "loss": 1.8208, "step": 777700 }, { "epoch": 45.493361408434225, "grad_norm": 0.368995726108551, "learning_rate": 0.001, "loss": 1.8223, "step": 777800 }, { "epoch": 45.49921038778733, "grad_norm": 0.18087144196033478, "learning_rate": 0.001, "loss": 1.8199, "step": 777900 }, { "epoch": 45.50505936714043, "grad_norm": 0.19600169360637665, "learning_rate": 0.001, "loss": 1.8222, "step": 778000 }, { "epoch": 45.51090834649354, "grad_norm": 0.17728127539157867, "learning_rate": 0.001, "loss": 1.8148, "step": 778100 }, { "epoch": 45.51675732584664, "grad_norm": 0.25955629348754883, "learning_rate": 0.001, "loss": 1.8225, "step": 778200 }, { "epoch": 45.522606305199744, "grad_norm": 0.21140344440937042, "learning_rate": 0.001, "loss": 1.8165, "step": 778300 }, { "epoch": 45.52845528455285, "grad_norm": 0.2309628427028656, "learning_rate": 0.001, "loss": 1.8167, "step": 778400 }, { "epoch": 45.53430426390595, "grad_norm": 0.20004801452159882, "learning_rate": 0.001, "loss": 1.8215, "step": 778500 }, { "epoch": 45.54015324325905, "grad_norm": 0.22565209865570068, "learning_rate": 0.001, "loss": 1.8169, "step": 778600 }, { "epoch": 45.54600222261215, "grad_norm": 0.3358253240585327, "learning_rate": 0.001, "loss": 1.8207, "step": 778700 }, { "epoch": 45.551851201965256, "grad_norm": 0.1839466542005539, "learning_rate": 0.001, "loss": 1.8207, "step": 778800 }, { "epoch": 45.55770018131836, "grad_norm": 0.29678821563720703, "learning_rate": 0.001, "loss": 1.827, "step": 778900 }, { "epoch": 45.563549160671464, "grad_norm": 0.2244640588760376, "learning_rate": 0.001, "loss": 1.8223, "step": 779000 }, { "epoch": 45.56939814002457, "grad_norm": 0.1762627512216568, "learning_rate": 0.001, "loss": 1.8236, "step": 779100 }, { "epoch": 45.57524711937767, "grad_norm": 0.22779230773448944, "learning_rate": 0.001, "loss": 1.8177, "step": 779200 }, { "epoch": 45.58109609873077, "grad_norm": 0.21607066690921783, "learning_rate": 0.001, "loss": 1.8229, "step": 779300 }, { "epoch": 45.58694507808387, "grad_norm": 0.20546643435955048, "learning_rate": 0.001, "loss": 1.8213, "step": 779400 }, { "epoch": 45.592794057436976, "grad_norm": 0.14818090200424194, "learning_rate": 0.001, "loss": 1.8222, "step": 779500 }, { "epoch": 45.59864303679008, "grad_norm": 0.20307260751724243, "learning_rate": 0.001, "loss": 1.8205, "step": 779600 }, { "epoch": 45.60449201614318, "grad_norm": 0.2072141319513321, "learning_rate": 0.001, "loss": 1.8271, "step": 779700 }, { "epoch": 45.61034099549629, "grad_norm": 0.2028447389602661, "learning_rate": 0.001, "loss": 1.812, "step": 779800 }, { "epoch": 45.61618997484939, "grad_norm": 0.1859346181154251, "learning_rate": 0.001, "loss": 1.8163, "step": 779900 }, { "epoch": 45.622038954202495, "grad_norm": 0.2066309154033661, "learning_rate": 0.001, "loss": 1.82, "step": 780000 }, { "epoch": 45.62788793355559, "grad_norm": 0.2107744663953781, "learning_rate": 0.001, "loss": 1.8197, "step": 780100 }, { "epoch": 45.633736912908695, "grad_norm": 0.23848478496074677, "learning_rate": 0.001, "loss": 1.8177, "step": 780200 }, { "epoch": 45.6395858922618, "grad_norm": 0.1669883131980896, "learning_rate": 0.001, "loss": 1.8223, "step": 780300 }, { "epoch": 45.6454348716149, "grad_norm": 0.204730823636055, "learning_rate": 0.001, "loss": 1.8233, "step": 780400 }, { "epoch": 45.65128385096801, "grad_norm": 0.18369045853614807, "learning_rate": 0.001, "loss": 1.8222, "step": 780500 }, { "epoch": 45.65713283032111, "grad_norm": 0.21047142148017883, "learning_rate": 0.001, "loss": 1.8221, "step": 780600 }, { "epoch": 45.662981809674214, "grad_norm": 0.17416317760944366, "learning_rate": 0.001, "loss": 1.8266, "step": 780700 }, { "epoch": 45.66883078902732, "grad_norm": 0.20644783973693848, "learning_rate": 0.001, "loss": 1.818, "step": 780800 }, { "epoch": 45.674679768380415, "grad_norm": 0.17060764133930206, "learning_rate": 0.001, "loss": 1.8188, "step": 780900 }, { "epoch": 45.68052874773352, "grad_norm": 0.19203156232833862, "learning_rate": 0.001, "loss": 1.8188, "step": 781000 }, { "epoch": 45.68637772708662, "grad_norm": 0.22316576540470123, "learning_rate": 0.001, "loss": 1.8242, "step": 781100 }, { "epoch": 45.692226706439726, "grad_norm": 0.21954087913036346, "learning_rate": 0.001, "loss": 1.8254, "step": 781200 }, { "epoch": 45.69807568579283, "grad_norm": 0.1736965924501419, "learning_rate": 0.001, "loss": 1.8195, "step": 781300 }, { "epoch": 45.703924665145934, "grad_norm": 0.18466226756572723, "learning_rate": 0.001, "loss": 1.8199, "step": 781400 }, { "epoch": 45.70977364449904, "grad_norm": 0.4234849214553833, "learning_rate": 0.001, "loss": 1.8304, "step": 781500 }, { "epoch": 45.71562262385214, "grad_norm": 0.18360696732997894, "learning_rate": 0.001, "loss": 1.8147, "step": 781600 }, { "epoch": 45.72147160320524, "grad_norm": 0.16997015476226807, "learning_rate": 0.001, "loss": 1.8218, "step": 781700 }, { "epoch": 45.72732058255834, "grad_norm": 0.19984029233455658, "learning_rate": 0.001, "loss": 1.8243, "step": 781800 }, { "epoch": 45.733169561911446, "grad_norm": 0.16655103862285614, "learning_rate": 0.001, "loss": 1.8203, "step": 781900 }, { "epoch": 45.73901854126455, "grad_norm": 0.20927177369594574, "learning_rate": 0.001, "loss": 1.8226, "step": 782000 }, { "epoch": 45.74486752061765, "grad_norm": 0.21499241888523102, "learning_rate": 0.001, "loss": 1.8235, "step": 782100 }, { "epoch": 45.75071649997076, "grad_norm": 0.20067258179187775, "learning_rate": 0.001, "loss": 1.8244, "step": 782200 }, { "epoch": 45.75656547932386, "grad_norm": 0.2024761438369751, "learning_rate": 0.001, "loss": 1.8245, "step": 782300 }, { "epoch": 45.76241445867696, "grad_norm": 0.19390223920345306, "learning_rate": 0.001, "loss": 1.8202, "step": 782400 }, { "epoch": 45.76826343803006, "grad_norm": 0.18887607753276825, "learning_rate": 0.001, "loss": 1.8143, "step": 782500 }, { "epoch": 45.774112417383165, "grad_norm": 0.20589059591293335, "learning_rate": 0.001, "loss": 1.8213, "step": 782600 }, { "epoch": 45.77996139673627, "grad_norm": 0.15739738941192627, "learning_rate": 0.001, "loss": 1.829, "step": 782700 }, { "epoch": 45.78581037608937, "grad_norm": 0.200389102101326, "learning_rate": 0.001, "loss": 1.8212, "step": 782800 }, { "epoch": 45.79165935544248, "grad_norm": 0.21401578187942505, "learning_rate": 0.001, "loss": 1.8196, "step": 782900 }, { "epoch": 45.79750833479558, "grad_norm": 0.15766426920890808, "learning_rate": 0.001, "loss": 1.8223, "step": 783000 }, { "epoch": 45.803357314148684, "grad_norm": 0.2001141756772995, "learning_rate": 0.001, "loss": 1.8291, "step": 783100 }, { "epoch": 45.80920629350178, "grad_norm": 0.19254301488399506, "learning_rate": 0.001, "loss": 1.8264, "step": 783200 }, { "epoch": 45.815055272854885, "grad_norm": 0.258603572845459, "learning_rate": 0.001, "loss": 1.8262, "step": 783300 }, { "epoch": 45.82090425220799, "grad_norm": 0.16746559739112854, "learning_rate": 0.001, "loss": 1.8192, "step": 783400 }, { "epoch": 45.82675323156109, "grad_norm": 0.1976192742586136, "learning_rate": 0.001, "loss": 1.8152, "step": 783500 }, { "epoch": 45.832602210914196, "grad_norm": 0.17874109745025635, "learning_rate": 0.001, "loss": 1.8251, "step": 783600 }, { "epoch": 45.8384511902673, "grad_norm": 0.15088069438934326, "learning_rate": 0.001, "loss": 1.8204, "step": 783700 }, { "epoch": 45.8443001696204, "grad_norm": 0.3085401952266693, "learning_rate": 0.001, "loss": 1.8269, "step": 783800 }, { "epoch": 45.85014914897351, "grad_norm": 0.20855006575584412, "learning_rate": 0.001, "loss": 1.8243, "step": 783900 }, { "epoch": 45.855998128326604, "grad_norm": 0.27920424938201904, "learning_rate": 0.001, "loss": 1.8157, "step": 784000 }, { "epoch": 45.86184710767971, "grad_norm": 0.18894948065280914, "learning_rate": 0.001, "loss": 1.8219, "step": 784100 }, { "epoch": 45.86769608703281, "grad_norm": 0.18858087062835693, "learning_rate": 0.001, "loss": 1.8293, "step": 784200 }, { "epoch": 45.873545066385915, "grad_norm": 0.21622566878795624, "learning_rate": 0.001, "loss": 1.824, "step": 784300 }, { "epoch": 45.87939404573902, "grad_norm": 0.2054031640291214, "learning_rate": 0.001, "loss": 1.8246, "step": 784400 }, { "epoch": 45.88524302509212, "grad_norm": 0.23321382701396942, "learning_rate": 0.001, "loss": 1.8257, "step": 784500 }, { "epoch": 45.89109200444523, "grad_norm": 0.1723436415195465, "learning_rate": 0.001, "loss": 1.8274, "step": 784600 }, { "epoch": 45.89694098379833, "grad_norm": 0.2558678686618805, "learning_rate": 0.001, "loss": 1.82, "step": 784700 }, { "epoch": 45.90278996315143, "grad_norm": 0.2162775695323944, "learning_rate": 0.001, "loss": 1.8265, "step": 784800 }, { "epoch": 45.90863894250453, "grad_norm": 0.20311987400054932, "learning_rate": 0.001, "loss": 1.8234, "step": 784900 }, { "epoch": 45.914487921857635, "grad_norm": 0.18818937242031097, "learning_rate": 0.001, "loss": 1.8241, "step": 785000 }, { "epoch": 45.92033690121074, "grad_norm": 0.24159783124923706, "learning_rate": 0.001, "loss": 1.8277, "step": 785100 }, { "epoch": 45.92618588056384, "grad_norm": 0.20467017590999603, "learning_rate": 0.001, "loss": 1.8198, "step": 785200 }, { "epoch": 45.932034859916946, "grad_norm": 0.23605458438396454, "learning_rate": 0.001, "loss": 1.8203, "step": 785300 }, { "epoch": 45.93788383927005, "grad_norm": 0.19299189746379852, "learning_rate": 0.001, "loss": 1.8216, "step": 785400 }, { "epoch": 45.94373281862315, "grad_norm": 0.17641226947307587, "learning_rate": 0.001, "loss": 1.8225, "step": 785500 }, { "epoch": 45.94958179797625, "grad_norm": 0.1881822794675827, "learning_rate": 0.001, "loss": 1.8254, "step": 785600 }, { "epoch": 45.955430777329354, "grad_norm": 0.18285711109638214, "learning_rate": 0.001, "loss": 1.8204, "step": 785700 }, { "epoch": 45.96127975668246, "grad_norm": 0.18937747180461884, "learning_rate": 0.001, "loss": 1.8234, "step": 785800 }, { "epoch": 45.96712873603556, "grad_norm": 0.18900993466377258, "learning_rate": 0.001, "loss": 1.8277, "step": 785900 }, { "epoch": 45.972977715388666, "grad_norm": 0.20552414655685425, "learning_rate": 0.001, "loss": 1.8268, "step": 786000 }, { "epoch": 45.97882669474177, "grad_norm": 0.18365782499313354, "learning_rate": 0.001, "loss": 1.822, "step": 786100 }, { "epoch": 45.98467567409487, "grad_norm": 0.16732746362686157, "learning_rate": 0.001, "loss": 1.8244, "step": 786200 }, { "epoch": 45.99052465344797, "grad_norm": 0.21598340570926666, "learning_rate": 0.001, "loss": 1.8261, "step": 786300 }, { "epoch": 45.996373632801074, "grad_norm": 0.205243781208992, "learning_rate": 0.001, "loss": 1.8248, "step": 786400 }, { "epoch": 46.00222261215418, "grad_norm": 0.21643564105033875, "learning_rate": 0.001, "loss": 1.8132, "step": 786500 }, { "epoch": 46.00807159150728, "grad_norm": 0.2647686302661896, "learning_rate": 0.001, "loss": 1.8122, "step": 786600 }, { "epoch": 46.013920570860385, "grad_norm": 0.3029184341430664, "learning_rate": 0.001, "loss": 1.8106, "step": 786700 }, { "epoch": 46.01976955021349, "grad_norm": 0.20843061804771423, "learning_rate": 0.001, "loss": 1.8049, "step": 786800 }, { "epoch": 46.02561852956659, "grad_norm": 0.1896626353263855, "learning_rate": 0.001, "loss": 1.8097, "step": 786900 }, { "epoch": 46.0314675089197, "grad_norm": 0.2748676836490631, "learning_rate": 0.001, "loss": 1.8163, "step": 787000 }, { "epoch": 46.03731648827279, "grad_norm": 0.28417354822158813, "learning_rate": 0.001, "loss": 1.8135, "step": 787100 }, { "epoch": 46.0431654676259, "grad_norm": 0.1756192296743393, "learning_rate": 0.001, "loss": 1.8147, "step": 787200 }, { "epoch": 46.049014446979, "grad_norm": 0.2774744927883148, "learning_rate": 0.001, "loss": 1.812, "step": 787300 }, { "epoch": 46.054863426332105, "grad_norm": 0.24503639340400696, "learning_rate": 0.001, "loss": 1.8142, "step": 787400 }, { "epoch": 46.06071240568521, "grad_norm": 0.21712058782577515, "learning_rate": 0.001, "loss": 1.8104, "step": 787500 }, { "epoch": 46.06656138503831, "grad_norm": 0.22395603358745575, "learning_rate": 0.001, "loss": 1.8122, "step": 787600 }, { "epoch": 46.072410364391416, "grad_norm": 0.18858422338962555, "learning_rate": 0.001, "loss": 1.8116, "step": 787700 }, { "epoch": 46.07825934374452, "grad_norm": 0.18648125231266022, "learning_rate": 0.001, "loss": 1.8112, "step": 787800 }, { "epoch": 46.08410832309762, "grad_norm": 0.3264257609844208, "learning_rate": 0.001, "loss": 1.8142, "step": 787900 }, { "epoch": 46.08995730245072, "grad_norm": 0.19865888357162476, "learning_rate": 0.001, "loss": 1.8074, "step": 788000 }, { "epoch": 46.095806281803824, "grad_norm": 0.22731254994869232, "learning_rate": 0.001, "loss": 1.8126, "step": 788100 }, { "epoch": 46.10165526115693, "grad_norm": 0.21132396161556244, "learning_rate": 0.001, "loss": 1.8087, "step": 788200 }, { "epoch": 46.10750424051003, "grad_norm": 0.2322874665260315, "learning_rate": 0.001, "loss": 1.8134, "step": 788300 }, { "epoch": 46.113353219863136, "grad_norm": 0.2029511034488678, "learning_rate": 0.001, "loss": 1.82, "step": 788400 }, { "epoch": 46.11920219921624, "grad_norm": 0.24215668439865112, "learning_rate": 0.001, "loss": 1.8127, "step": 788500 }, { "epoch": 46.125051178569336, "grad_norm": 0.26003962755203247, "learning_rate": 0.001, "loss": 1.8129, "step": 788600 }, { "epoch": 46.13090015792244, "grad_norm": 0.26962608098983765, "learning_rate": 0.001, "loss": 1.8136, "step": 788700 }, { "epoch": 46.136749137275544, "grad_norm": 0.2236962914466858, "learning_rate": 0.001, "loss": 1.814, "step": 788800 }, { "epoch": 46.14259811662865, "grad_norm": 0.1685556024312973, "learning_rate": 0.001, "loss": 1.8125, "step": 788900 }, { "epoch": 46.14844709598175, "grad_norm": 0.2081078439950943, "learning_rate": 0.001, "loss": 1.8186, "step": 789000 }, { "epoch": 46.154296075334855, "grad_norm": 0.23187056183815002, "learning_rate": 0.001, "loss": 1.8105, "step": 789100 }, { "epoch": 46.16014505468796, "grad_norm": 0.1882760226726532, "learning_rate": 0.001, "loss": 1.8081, "step": 789200 }, { "epoch": 46.16599403404106, "grad_norm": 0.19233310222625732, "learning_rate": 0.001, "loss": 1.8116, "step": 789300 }, { "epoch": 46.17184301339416, "grad_norm": 0.19782480597496033, "learning_rate": 0.001, "loss": 1.8114, "step": 789400 }, { "epoch": 46.17769199274726, "grad_norm": 0.23655962944030762, "learning_rate": 0.001, "loss": 1.8115, "step": 789500 }, { "epoch": 46.18354097210037, "grad_norm": 0.17639555037021637, "learning_rate": 0.001, "loss": 1.816, "step": 789600 }, { "epoch": 46.18938995145347, "grad_norm": 0.18621191382408142, "learning_rate": 0.001, "loss": 1.8047, "step": 789700 }, { "epoch": 46.195238930806575, "grad_norm": 0.2085781991481781, "learning_rate": 0.001, "loss": 1.8119, "step": 789800 }, { "epoch": 46.20108791015968, "grad_norm": 0.17675426602363586, "learning_rate": 0.001, "loss": 1.8085, "step": 789900 }, { "epoch": 46.20693688951278, "grad_norm": 0.28655096888542175, "learning_rate": 0.001, "loss": 1.8221, "step": 790000 }, { "epoch": 46.212785868865886, "grad_norm": 0.20735731720924377, "learning_rate": 0.001, "loss": 1.8083, "step": 790100 }, { "epoch": 46.21863484821898, "grad_norm": 0.18428722023963928, "learning_rate": 0.001, "loss": 1.812, "step": 790200 }, { "epoch": 46.22448382757209, "grad_norm": 0.17482495307922363, "learning_rate": 0.001, "loss": 1.8106, "step": 790300 }, { "epoch": 46.23033280692519, "grad_norm": 0.21563664078712463, "learning_rate": 0.001, "loss": 1.8138, "step": 790400 }, { "epoch": 46.236181786278294, "grad_norm": 0.2460772544145584, "learning_rate": 0.001, "loss": 1.8168, "step": 790500 }, { "epoch": 46.2420307656314, "grad_norm": 0.22424794733524323, "learning_rate": 0.001, "loss": 1.8131, "step": 790600 }, { "epoch": 46.2478797449845, "grad_norm": 0.1804879605770111, "learning_rate": 0.001, "loss": 1.8148, "step": 790700 }, { "epoch": 46.253728724337606, "grad_norm": 0.2180042564868927, "learning_rate": 0.001, "loss": 1.8161, "step": 790800 }, { "epoch": 46.25957770369071, "grad_norm": 0.19577504694461823, "learning_rate": 0.001, "loss": 1.8164, "step": 790900 }, { "epoch": 46.265426683043806, "grad_norm": 0.2405368685722351, "learning_rate": 0.001, "loss": 1.8193, "step": 791000 }, { "epoch": 46.27127566239691, "grad_norm": 0.23112457990646362, "learning_rate": 0.001, "loss": 1.8227, "step": 791100 }, { "epoch": 46.277124641750014, "grad_norm": 0.19611738622188568, "learning_rate": 0.001, "loss": 1.8174, "step": 791200 }, { "epoch": 46.28297362110312, "grad_norm": 0.21086972951889038, "learning_rate": 0.001, "loss": 1.8105, "step": 791300 }, { "epoch": 46.28882260045622, "grad_norm": 0.17865166068077087, "learning_rate": 0.001, "loss": 1.8087, "step": 791400 }, { "epoch": 46.294671579809325, "grad_norm": 0.1978405863046646, "learning_rate": 0.001, "loss": 1.8163, "step": 791500 }, { "epoch": 46.30052055916243, "grad_norm": 0.19571714103221893, "learning_rate": 0.001, "loss": 1.8157, "step": 791600 }, { "epoch": 46.306369538515526, "grad_norm": 0.21324069797992706, "learning_rate": 0.001, "loss": 1.8168, "step": 791700 }, { "epoch": 46.31221851786863, "grad_norm": 0.25980478525161743, "learning_rate": 0.001, "loss": 1.8164, "step": 791800 }, { "epoch": 46.31806749722173, "grad_norm": 0.20361827313899994, "learning_rate": 0.001, "loss": 1.8222, "step": 791900 }, { "epoch": 46.32391647657484, "grad_norm": 0.21725092828273773, "learning_rate": 0.001, "loss": 1.8183, "step": 792000 }, { "epoch": 46.32976545592794, "grad_norm": 0.21440137922763824, "learning_rate": 0.001, "loss": 1.8156, "step": 792100 }, { "epoch": 46.335614435281045, "grad_norm": 0.1982475221157074, "learning_rate": 0.001, "loss": 1.8144, "step": 792200 }, { "epoch": 46.34146341463415, "grad_norm": 0.19865162670612335, "learning_rate": 0.001, "loss": 1.8189, "step": 792300 }, { "epoch": 46.34731239398725, "grad_norm": 0.20860731601715088, "learning_rate": 0.001, "loss": 1.8154, "step": 792400 }, { "epoch": 46.35316137334035, "grad_norm": 0.18104591965675354, "learning_rate": 0.001, "loss": 1.8163, "step": 792500 }, { "epoch": 46.35901035269345, "grad_norm": 0.1850680559873581, "learning_rate": 0.001, "loss": 1.8183, "step": 792600 }, { "epoch": 46.36485933204656, "grad_norm": 0.2105087786912918, "learning_rate": 0.001, "loss": 1.8158, "step": 792700 }, { "epoch": 46.37070831139966, "grad_norm": 0.17094483971595764, "learning_rate": 0.001, "loss": 1.809, "step": 792800 }, { "epoch": 46.376557290752764, "grad_norm": 0.27020034193992615, "learning_rate": 0.001, "loss": 1.8169, "step": 792900 }, { "epoch": 46.38240627010587, "grad_norm": 0.22244670987129211, "learning_rate": 0.001, "loss": 1.8151, "step": 793000 }, { "epoch": 46.38825524945897, "grad_norm": 0.17703762650489807, "learning_rate": 0.001, "loss": 1.8158, "step": 793100 }, { "epoch": 46.394104228812076, "grad_norm": 0.24032244086265564, "learning_rate": 0.001, "loss": 1.8125, "step": 793200 }, { "epoch": 46.39995320816517, "grad_norm": 0.24174149334430695, "learning_rate": 0.001, "loss": 1.8151, "step": 793300 }, { "epoch": 46.405802187518276, "grad_norm": 0.2156030535697937, "learning_rate": 0.001, "loss": 1.8195, "step": 793400 }, { "epoch": 46.41165116687138, "grad_norm": 0.22429122030735016, "learning_rate": 0.001, "loss": 1.8149, "step": 793500 }, { "epoch": 46.417500146224484, "grad_norm": 0.18837390840053558, "learning_rate": 0.001, "loss": 1.8152, "step": 793600 }, { "epoch": 46.42334912557759, "grad_norm": 0.22470611333847046, "learning_rate": 0.001, "loss": 1.8198, "step": 793700 }, { "epoch": 46.42919810493069, "grad_norm": 0.28657665848731995, "learning_rate": 0.001, "loss": 1.8189, "step": 793800 }, { "epoch": 46.435047084283795, "grad_norm": 0.3214173913002014, "learning_rate": 0.001, "loss": 1.8214, "step": 793900 }, { "epoch": 46.4408960636369, "grad_norm": 0.30502772331237793, "learning_rate": 0.001, "loss": 1.8187, "step": 794000 }, { "epoch": 46.446745042989996, "grad_norm": 0.2373882681131363, "learning_rate": 0.001, "loss": 1.813, "step": 794100 }, { "epoch": 46.4525940223431, "grad_norm": 0.18298093974590302, "learning_rate": 0.001, "loss": 1.8188, "step": 794200 }, { "epoch": 46.4584430016962, "grad_norm": 0.20601288974285126, "learning_rate": 0.001, "loss": 1.819, "step": 794300 }, { "epoch": 46.46429198104931, "grad_norm": 0.21799710392951965, "learning_rate": 0.001, "loss": 1.8159, "step": 794400 }, { "epoch": 46.47014096040241, "grad_norm": 0.21178285777568817, "learning_rate": 0.001, "loss": 1.8199, "step": 794500 }, { "epoch": 46.475989939755515, "grad_norm": 0.20250073075294495, "learning_rate": 0.001, "loss": 1.8169, "step": 794600 }, { "epoch": 46.48183891910862, "grad_norm": 0.18584978580474854, "learning_rate": 0.001, "loss": 1.8149, "step": 794700 }, { "epoch": 46.487687898461715, "grad_norm": 0.2206672728061676, "learning_rate": 0.001, "loss": 1.812, "step": 794800 }, { "epoch": 46.49353687781482, "grad_norm": 0.20775938034057617, "learning_rate": 0.001, "loss": 1.8134, "step": 794900 }, { "epoch": 46.49938585716792, "grad_norm": 0.2198086678981781, "learning_rate": 0.001, "loss": 1.82, "step": 795000 }, { "epoch": 46.505234836521026, "grad_norm": 0.23021966218948364, "learning_rate": 0.001, "loss": 1.8167, "step": 795100 }, { "epoch": 46.51108381587413, "grad_norm": 0.2591667175292969, "learning_rate": 0.001, "loss": 1.8187, "step": 795200 }, { "epoch": 46.516932795227234, "grad_norm": 0.2054297775030136, "learning_rate": 0.001, "loss": 1.8165, "step": 795300 }, { "epoch": 46.52278177458034, "grad_norm": 0.18785659968852997, "learning_rate": 0.001, "loss": 1.8178, "step": 795400 }, { "epoch": 46.52863075393344, "grad_norm": 0.18182869255542755, "learning_rate": 0.001, "loss": 1.8176, "step": 795500 }, { "epoch": 46.53447973328654, "grad_norm": 0.20062750577926636, "learning_rate": 0.001, "loss": 1.8202, "step": 795600 }, { "epoch": 46.54032871263964, "grad_norm": 0.16898980736732483, "learning_rate": 0.001, "loss": 1.8208, "step": 795700 }, { "epoch": 46.546177691992746, "grad_norm": 0.2940455973148346, "learning_rate": 0.001, "loss": 1.8171, "step": 795800 }, { "epoch": 46.55202667134585, "grad_norm": 0.19447962939739227, "learning_rate": 0.001, "loss": 1.8136, "step": 795900 }, { "epoch": 46.55787565069895, "grad_norm": 0.18882256746292114, "learning_rate": 0.001, "loss": 1.8199, "step": 796000 }, { "epoch": 46.56372463005206, "grad_norm": 0.20534853637218475, "learning_rate": 0.001, "loss": 1.8187, "step": 796100 }, { "epoch": 46.56957360940516, "grad_norm": 0.19806867837905884, "learning_rate": 0.001, "loss": 1.8157, "step": 796200 }, { "epoch": 46.575422588758265, "grad_norm": 0.1911357343196869, "learning_rate": 0.001, "loss": 1.8177, "step": 796300 }, { "epoch": 46.58127156811136, "grad_norm": 0.18425670266151428, "learning_rate": 0.001, "loss": 1.8213, "step": 796400 }, { "epoch": 46.587120547464465, "grad_norm": 0.2262556105852127, "learning_rate": 0.001, "loss": 1.8139, "step": 796500 }, { "epoch": 46.59296952681757, "grad_norm": 0.26003533601760864, "learning_rate": 0.001, "loss": 1.819, "step": 796600 }, { "epoch": 46.59881850617067, "grad_norm": 0.3093421757221222, "learning_rate": 0.001, "loss": 1.8253, "step": 796700 }, { "epoch": 46.60466748552378, "grad_norm": 0.28890112042427063, "learning_rate": 0.001, "loss": 1.8174, "step": 796800 }, { "epoch": 46.61051646487688, "grad_norm": 0.19587744772434235, "learning_rate": 0.001, "loss": 1.8206, "step": 796900 }, { "epoch": 46.616365444229984, "grad_norm": 0.24954460561275482, "learning_rate": 0.001, "loss": 1.8177, "step": 797000 }, { "epoch": 46.62221442358309, "grad_norm": 0.23741601407527924, "learning_rate": 0.001, "loss": 1.8217, "step": 797100 }, { "epoch": 46.628063402936185, "grad_norm": 0.23708166182041168, "learning_rate": 0.001, "loss": 1.8207, "step": 797200 }, { "epoch": 46.63391238228929, "grad_norm": 0.24185222387313843, "learning_rate": 0.001, "loss": 1.817, "step": 797300 }, { "epoch": 46.63976136164239, "grad_norm": 0.22730937600135803, "learning_rate": 0.001, "loss": 1.8226, "step": 797400 }, { "epoch": 46.645610340995496, "grad_norm": 0.20307374000549316, "learning_rate": 0.001, "loss": 1.8198, "step": 797500 }, { "epoch": 46.6514593203486, "grad_norm": 0.19036000967025757, "learning_rate": 0.001, "loss": 1.8209, "step": 797600 }, { "epoch": 46.657308299701704, "grad_norm": 0.21380078792572021, "learning_rate": 0.001, "loss": 1.822, "step": 797700 }, { "epoch": 46.66315727905481, "grad_norm": 0.2286577671766281, "learning_rate": 0.001, "loss": 1.8192, "step": 797800 }, { "epoch": 46.669006258407904, "grad_norm": 0.17585985362529755, "learning_rate": 0.001, "loss": 1.8168, "step": 797900 }, { "epoch": 46.67485523776101, "grad_norm": 0.18410493433475494, "learning_rate": 0.001, "loss": 1.817, "step": 798000 }, { "epoch": 46.68070421711411, "grad_norm": 0.14925292134284973, "learning_rate": 0.001, "loss": 1.8215, "step": 798100 }, { "epoch": 46.686553196467216, "grad_norm": 0.31840780377388, "learning_rate": 0.001, "loss": 1.8245, "step": 798200 }, { "epoch": 46.69240217582032, "grad_norm": 0.20316722989082336, "learning_rate": 0.001, "loss": 1.8245, "step": 798300 }, { "epoch": 46.69825115517342, "grad_norm": 0.18832910060882568, "learning_rate": 0.001, "loss": 1.8179, "step": 798400 }, { "epoch": 46.70410013452653, "grad_norm": 0.24638986587524414, "learning_rate": 0.001, "loss": 1.8255, "step": 798500 }, { "epoch": 46.70994911387963, "grad_norm": 0.2483077198266983, "learning_rate": 0.001, "loss": 1.8176, "step": 798600 }, { "epoch": 46.71579809323273, "grad_norm": 0.20388124883174896, "learning_rate": 0.001, "loss": 1.8179, "step": 798700 }, { "epoch": 46.72164707258583, "grad_norm": 0.24701473116874695, "learning_rate": 0.001, "loss": 1.8227, "step": 798800 }, { "epoch": 46.727496051938935, "grad_norm": 0.2044535130262375, "learning_rate": 0.001, "loss": 1.8236, "step": 798900 }, { "epoch": 46.73334503129204, "grad_norm": 0.19224266707897186, "learning_rate": 0.001, "loss": 1.8237, "step": 799000 }, { "epoch": 46.73919401064514, "grad_norm": 0.28085577487945557, "learning_rate": 0.001, "loss": 1.821, "step": 799100 }, { "epoch": 46.74504298999825, "grad_norm": 0.19710470736026764, "learning_rate": 0.001, "loss": 1.8182, "step": 799200 }, { "epoch": 46.75089196935135, "grad_norm": 0.2425040900707245, "learning_rate": 0.001, "loss": 1.8238, "step": 799300 }, { "epoch": 46.756740948704454, "grad_norm": 0.18700434267520905, "learning_rate": 0.001, "loss": 1.8145, "step": 799400 }, { "epoch": 46.76258992805755, "grad_norm": 0.24311399459838867, "learning_rate": 0.001, "loss": 1.8229, "step": 799500 }, { "epoch": 46.768438907410655, "grad_norm": 0.1727420687675476, "learning_rate": 0.001, "loss": 1.8197, "step": 799600 }, { "epoch": 46.77428788676376, "grad_norm": 0.17430859804153442, "learning_rate": 0.001, "loss": 1.8195, "step": 799700 }, { "epoch": 46.78013686611686, "grad_norm": 0.2367011159658432, "learning_rate": 0.001, "loss": 1.8205, "step": 799800 }, { "epoch": 46.785985845469966, "grad_norm": 0.16013364493846893, "learning_rate": 0.001, "loss": 1.82, "step": 799900 }, { "epoch": 46.79183482482307, "grad_norm": 0.1970205157995224, "learning_rate": 0.001, "loss": 1.8293, "step": 800000 }, { "epoch": 46.79183482482307, "eval_ag_news_accuracy": 0.24109375, "eval_ag_news_bleu_score": 7.277663753396659, "eval_ag_news_bleu_score_sem": 0.4943358472985364, "eval_ag_news_emb_cos_sim": 0.7162008881568909, "eval_ag_news_emb_cos_sim_sem": 0.014609021134674549, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7019975185394287, "eval_ag_news_n_ngrams_match_1": 13.6171875, "eval_ag_news_n_ngrams_match_2": 4.1171875, "eval_ag_news_n_ngrams_match_3": 1.5, "eval_ag_news_num_pred_words": 44.2734375, "eval_ag_news_num_true_words": 43.53125, "eval_ag_news_perplexity": 14.909483970293081, "eval_ag_news_pred_num_tokens": 65.625, "eval_ag_news_rouge_score": 0.2974016465275922, "eval_ag_news_runtime": 37.8039, "eval_ag_news_samples_per_second": 13.226, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3330982305569903, "eval_ag_news_token_set_f1_sem": 0.00961042532249291, "eval_ag_news_token_set_precision": 0.3149936001722025, "eval_ag_news_token_set_recall": 0.36275108572206344, "eval_ag_news_true_num_tokens": 59.1484375, "step": 800000 }, { "epoch": 46.79183482482307, "eval_anthropic_toxic_prompts_accuracy": 0.101453125, "eval_anthropic_toxic_prompts_bleu_score": 42.41192546206449, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.4303865366748423, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.894139289855957, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008778074756264687, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02854125206846796, "eval_anthropic_toxic_prompts_loss": 1.2436679601669312, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.015625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.6640625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.6640625, "eval_anthropic_toxic_prompts_num_pred_words": 13.9921875, "eval_anthropic_toxic_prompts_num_true_words": 13.78125, "eval_anthropic_toxic_prompts_perplexity": 3.4683117915452835, "eval_anthropic_toxic_prompts_pred_num_tokens": 17.84375, "eval_anthropic_toxic_prompts_rouge_score": 0.6920747269221392, "eval_anthropic_toxic_prompts_runtime": 28.7769, "eval_anthropic_toxic_prompts_samples_per_second": 17.375, "eval_anthropic_toxic_prompts_steps_per_second": 0.035, "eval_anthropic_toxic_prompts_token_set_f1": 0.7109770025875323, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01685638675309095, "eval_anthropic_toxic_prompts_token_set_precision": 0.7156775764535427, "eval_anthropic_toxic_prompts_token_set_recall": 0.716346266153834, "eval_anthropic_toxic_prompts_true_num_tokens": 16.9609375, "step": 800000 }, { "epoch": 46.79183482482307, "eval_arxiv_accuracy": 0.37603125, "eval_arxiv_bleu_score": 1.782048330712008, "eval_arxiv_bleu_score_sem": 0.14388609337526617, "eval_arxiv_emb_cos_sim": 0.4936535954475403, "eval_arxiv_emb_cos_sim_sem": 0.017598867416381836, "eval_arxiv_emb_top1_equal": 0.9453125, "eval_arxiv_emb_top1_equal_sem": 0.020175758749246597, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.447666883468628, "eval_arxiv_n_ngrams_match_1": 14.203125, "eval_arxiv_n_ngrams_match_2": 2.53125, "eval_arxiv_n_ngrams_match_3": 0.46875, "eval_arxiv_num_pred_words": 57.03125, "eval_arxiv_num_true_words": 85.625, "eval_arxiv_perplexity": 31.426983891203424, "eval_arxiv_pred_num_tokens": 125.734375, "eval_arxiv_rouge_score": 0.1824963531584463, "eval_arxiv_runtime": 30.8809, "eval_arxiv_samples_per_second": 16.191, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.1865635358263714, "eval_arxiv_token_set_f1_sem": 0.008623029368548668, "eval_arxiv_token_set_precision": 0.12848191262598813, "eval_arxiv_token_set_recall": 0.41187288367771246, "eval_arxiv_true_num_tokens": 124.46875, "step": 800000 }, { "epoch": 46.79183482482307, "eval_python_code_alpaca_accuracy": 0.13340625, "eval_python_code_alpaca_bleu_score": 28.08451318282478, "eval_python_code_alpaca_bleu_score_sem": 1.61902541176562, "eval_python_code_alpaca_emb_cos_sim": 0.8786609768867493, "eval_python_code_alpaca_emb_cos_sim_sem": 0.009479982778429985, "eval_python_code_alpaca_emb_top1_equal": 0.984375, "eval_python_code_alpaca_emb_top1_equal_sem": 0.011004959233105183, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4875856637954712, "eval_python_code_alpaca_n_ngrams_match_1": 10.5859375, "eval_python_code_alpaca_n_ngrams_match_2": 5.765625, "eval_python_code_alpaca_n_ngrams_match_3": 3.2890625, "eval_python_code_alpaca_num_pred_words": 17.4140625, "eval_python_code_alpaca_num_true_words": 17.875, "eval_python_code_alpaca_perplexity": 4.426395800205823, "eval_python_code_alpaca_pred_num_tokens": 22.171875, "eval_python_code_alpaca_rouge_score": 0.6179150188312984, "eval_python_code_alpaca_runtime": 30.0938, "eval_python_code_alpaca_samples_per_second": 16.615, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.63107247526944, "eval_python_code_alpaca_token_set_f1_sem": 0.013727163799693737, "eval_python_code_alpaca_token_set_precision": 0.6189360651773886, "eval_python_code_alpaca_token_set_recall": 0.648477483620036, "eval_python_code_alpaca_true_num_tokens": 23.0390625, "step": 800000 }, { "epoch": 46.79183482482307, "eval_wikibio_accuracy": 0.365890625, "eval_wikibio_bleu_score": 7.9325345789787285, "eval_wikibio_bleu_score_sem": 0.7459604802708297, "eval_wikibio_emb_cos_sim": 0.6277192831039429, "eval_wikibio_emb_cos_sim_sem": 0.022885335609316826, "eval_wikibio_emb_top1_equal": 0.9140625, "eval_wikibio_emb_top1_equal_sem": 0.024870097637176514, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.700683832168579, "eval_wikibio_n_ngrams_match_1": 15.5546875, "eval_wikibio_n_ngrams_match_2": 5.40625, "eval_wikibio_n_ngrams_match_3": 2.2890625, "eval_wikibio_num_pred_words": 50.609375, "eval_wikibio_num_true_words": 51.953125, "eval_wikibio_perplexity": 14.889910443957248, "eval_wikibio_pred_num_tokens": 103.1328125, "eval_wikibio_rouge_score": 0.3137852660478885, "eval_wikibio_runtime": 30.6987, "eval_wikibio_samples_per_second": 16.287, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.336704887513843, "eval_wikibio_token_set_f1_sem": 0.011582732699524477, "eval_wikibio_token_set_precision": 0.29784160298159834, "eval_wikibio_token_set_recall": 0.43082672460978566, "eval_wikibio_true_num_tokens": 99.3359375, "step": 800000 }, { "epoch": 46.79183482482307, "eval_msmarco_accuracy": 0.388890625, "eval_msmarco_bleu_score": 20.80232696388731, "eval_msmarco_bleu_score_sem": 1.655678281217969, "eval_msmarco_emb_cos_sim": 0.8134458065032959, "eval_msmarco_emb_cos_sim_sem": 0.014074575155973434, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7423689365386963, "eval_msmarco_n_ngrams_match_1": 29.96875, "eval_msmarco_n_ngrams_match_2": 14.5546875, "eval_msmarco_n_ngrams_match_3": 8.796875, "eval_msmarco_num_pred_words": 62.0390625, "eval_msmarco_num_true_words": 62.8828125, "eval_msmarco_perplexity": 5.710856066658241, "eval_msmarco_pred_num_tokens": 82.6015625, "eval_msmarco_rouge_score": 0.4734446926804231, "eval_msmarco_runtime": 25.7415, "eval_msmarco_samples_per_second": 19.424, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.49505317510139335, "eval_msmarco_token_set_f1_sem": 0.014590617234582513, "eval_msmarco_token_set_precision": 0.46004875140946405, "eval_msmarco_token_set_recall": 0.5485366936287286, "eval_msmarco_true_num_tokens": 80.53125, "step": 800000 }, { "epoch": 46.797683804176174, "grad_norm": 0.17489273846149445, "learning_rate": 0.001, "loss": 1.8156, "step": 800100 }, { "epoch": 46.80353278352928, "grad_norm": 0.1846793293952942, "learning_rate": 0.001, "loss": 1.8187, "step": 800200 }, { "epoch": 46.809381762882374, "grad_norm": 0.19262106716632843, "learning_rate": 0.001, "loss": 1.8201, "step": 800300 }, { "epoch": 46.81523074223548, "grad_norm": 0.19352740049362183, "learning_rate": 0.001, "loss": 1.8207, "step": 800400 }, { "epoch": 46.82107972158858, "grad_norm": 0.20636197924613953, "learning_rate": 0.001, "loss": 1.8262, "step": 800500 }, { "epoch": 46.826928700941686, "grad_norm": 0.26708874106407166, "learning_rate": 0.001, "loss": 1.8207, "step": 800600 }, { "epoch": 46.83277768029479, "grad_norm": 0.20078976452350616, "learning_rate": 0.001, "loss": 1.8173, "step": 800700 }, { "epoch": 46.83862665964789, "grad_norm": 0.21084590256214142, "learning_rate": 0.001, "loss": 1.8174, "step": 800800 }, { "epoch": 46.844475639001, "grad_norm": 0.1745433509349823, "learning_rate": 0.001, "loss": 1.8179, "step": 800900 }, { "epoch": 46.850324618354094, "grad_norm": 0.2851060926914215, "learning_rate": 0.001, "loss": 1.8181, "step": 801000 }, { "epoch": 46.8561735977072, "grad_norm": 0.20713898539543152, "learning_rate": 0.001, "loss": 1.8214, "step": 801100 }, { "epoch": 46.8620225770603, "grad_norm": 0.24154040217399597, "learning_rate": 0.001, "loss": 1.827, "step": 801200 }, { "epoch": 46.867871556413405, "grad_norm": 0.20041748881340027, "learning_rate": 0.001, "loss": 1.8253, "step": 801300 }, { "epoch": 46.87372053576651, "grad_norm": 0.22993351519107819, "learning_rate": 0.001, "loss": 1.8173, "step": 801400 }, { "epoch": 46.87956951511961, "grad_norm": 0.2903001010417938, "learning_rate": 0.001, "loss": 1.8255, "step": 801500 }, { "epoch": 46.88541849447272, "grad_norm": 0.2691403329372406, "learning_rate": 0.001, "loss": 1.8211, "step": 801600 }, { "epoch": 46.89126747382582, "grad_norm": 0.19583643972873688, "learning_rate": 0.001, "loss": 1.8175, "step": 801700 }, { "epoch": 46.89711645317892, "grad_norm": 0.2487078458070755, "learning_rate": 0.001, "loss": 1.8218, "step": 801800 }, { "epoch": 46.90296543253202, "grad_norm": 0.28375980257987976, "learning_rate": 0.001, "loss": 1.8247, "step": 801900 }, { "epoch": 46.908814411885125, "grad_norm": 0.3289111256599426, "learning_rate": 0.001, "loss": 1.8204, "step": 802000 }, { "epoch": 46.91466339123823, "grad_norm": 0.19881446659564972, "learning_rate": 0.001, "loss": 1.8278, "step": 802100 }, { "epoch": 46.92051237059133, "grad_norm": 0.23203638195991516, "learning_rate": 0.001, "loss": 1.8205, "step": 802200 }, { "epoch": 46.926361349944436, "grad_norm": 0.18210257589817047, "learning_rate": 0.001, "loss": 1.8191, "step": 802300 }, { "epoch": 46.93221032929754, "grad_norm": 0.20055541396141052, "learning_rate": 0.001, "loss": 1.8188, "step": 802400 }, { "epoch": 46.938059308650644, "grad_norm": 0.23859788477420807, "learning_rate": 0.001, "loss": 1.8219, "step": 802500 }, { "epoch": 46.94390828800374, "grad_norm": 0.20092856884002686, "learning_rate": 0.001, "loss": 1.8216, "step": 802600 }, { "epoch": 46.949757267356844, "grad_norm": 0.2262505441904068, "learning_rate": 0.001, "loss": 1.821, "step": 802700 }, { "epoch": 46.95560624670995, "grad_norm": 0.20521065592765808, "learning_rate": 0.001, "loss": 1.8211, "step": 802800 }, { "epoch": 46.96145522606305, "grad_norm": 0.21644467115402222, "learning_rate": 0.001, "loss": 1.8225, "step": 802900 }, { "epoch": 46.967304205416156, "grad_norm": 0.218624085187912, "learning_rate": 0.001, "loss": 1.8215, "step": 803000 }, { "epoch": 46.97315318476926, "grad_norm": 0.21603748202323914, "learning_rate": 0.001, "loss": 1.826, "step": 803100 }, { "epoch": 46.97900216412236, "grad_norm": 0.2025124430656433, "learning_rate": 0.001, "loss": 1.8149, "step": 803200 }, { "epoch": 46.98485114347547, "grad_norm": 0.20178894698619843, "learning_rate": 0.001, "loss": 1.82, "step": 803300 }, { "epoch": 46.990700122828564, "grad_norm": 0.28356242179870605, "learning_rate": 0.001, "loss": 1.8225, "step": 803400 }, { "epoch": 46.99654910218167, "grad_norm": 0.24473345279693604, "learning_rate": 0.001, "loss": 1.829, "step": 803500 }, { "epoch": 47.00239808153477, "grad_norm": 0.178097665309906, "learning_rate": 0.001, "loss": 1.8184, "step": 803600 }, { "epoch": 47.008247060887875, "grad_norm": 0.19266295433044434, "learning_rate": 0.001, "loss": 1.8109, "step": 803700 }, { "epoch": 47.01409604024098, "grad_norm": 0.16617758572101593, "learning_rate": 0.001, "loss": 1.8056, "step": 803800 }, { "epoch": 47.01994501959408, "grad_norm": 0.16979239881038666, "learning_rate": 0.001, "loss": 1.8093, "step": 803900 }, { "epoch": 47.02579399894719, "grad_norm": 0.16482548415660858, "learning_rate": 0.001, "loss": 1.8093, "step": 804000 }, { "epoch": 47.03164297830028, "grad_norm": 0.1621529906988144, "learning_rate": 0.001, "loss": 1.8144, "step": 804100 }, { "epoch": 47.03749195765339, "grad_norm": 0.2077207714319229, "learning_rate": 0.001, "loss": 1.8107, "step": 804200 }, { "epoch": 47.04334093700649, "grad_norm": 0.17232300341129303, "learning_rate": 0.001, "loss": 1.81, "step": 804300 }, { "epoch": 47.049189916359595, "grad_norm": 0.14081361889839172, "learning_rate": 0.001, "loss": 1.8112, "step": 804400 }, { "epoch": 47.0550388957127, "grad_norm": 0.1650649905204773, "learning_rate": 0.001, "loss": 1.8077, "step": 804500 }, { "epoch": 47.0608878750658, "grad_norm": 0.18549376726150513, "learning_rate": 0.001, "loss": 1.808, "step": 804600 }, { "epoch": 47.066736854418906, "grad_norm": 0.17308758199214935, "learning_rate": 0.001, "loss": 1.8064, "step": 804700 }, { "epoch": 47.07258583377201, "grad_norm": 0.17040257155895233, "learning_rate": 0.001, "loss": 1.8077, "step": 804800 }, { "epoch": 47.07843481312511, "grad_norm": 0.20229144394397736, "learning_rate": 0.001, "loss": 1.8138, "step": 804900 }, { "epoch": 47.08428379247821, "grad_norm": 0.20065850019454956, "learning_rate": 0.001, "loss": 1.8144, "step": 805000 }, { "epoch": 47.090132771831314, "grad_norm": 0.15490210056304932, "learning_rate": 0.001, "loss": 1.8092, "step": 805100 }, { "epoch": 47.09598175118442, "grad_norm": 0.19233998656272888, "learning_rate": 0.001, "loss": 1.813, "step": 805200 }, { "epoch": 47.10183073053752, "grad_norm": 0.16065827012062073, "learning_rate": 0.001, "loss": 1.8111, "step": 805300 }, { "epoch": 47.107679709890625, "grad_norm": 0.19950367510318756, "learning_rate": 0.001, "loss": 1.8073, "step": 805400 }, { "epoch": 47.11352868924373, "grad_norm": 0.17894645035266876, "learning_rate": 0.001, "loss": 1.812, "step": 805500 }, { "epoch": 47.11937766859683, "grad_norm": 0.17631925642490387, "learning_rate": 0.001, "loss": 1.8118, "step": 805600 }, { "epoch": 47.12522664794993, "grad_norm": 0.17956703901290894, "learning_rate": 0.001, "loss": 1.8088, "step": 805700 }, { "epoch": 47.131075627303034, "grad_norm": 0.1867772787809372, "learning_rate": 0.001, "loss": 1.814, "step": 805800 }, { "epoch": 47.13692460665614, "grad_norm": 0.23897121846675873, "learning_rate": 0.001, "loss": 1.8085, "step": 805900 }, { "epoch": 47.14277358600924, "grad_norm": 0.17730779945850372, "learning_rate": 0.001, "loss": 1.8091, "step": 806000 }, { "epoch": 47.148622565362345, "grad_norm": 0.17549295723438263, "learning_rate": 0.001, "loss": 1.8128, "step": 806100 }, { "epoch": 47.15447154471545, "grad_norm": 0.14071576297283173, "learning_rate": 0.001, "loss": 1.8103, "step": 806200 }, { "epoch": 47.16032052406855, "grad_norm": 0.16628411412239075, "learning_rate": 0.001, "loss": 1.8145, "step": 806300 }, { "epoch": 47.166169503421656, "grad_norm": 0.19433750212192535, "learning_rate": 0.001, "loss": 1.8124, "step": 806400 }, { "epoch": 47.17201848277475, "grad_norm": 0.17163418233394623, "learning_rate": 0.001, "loss": 1.8115, "step": 806500 }, { "epoch": 47.17786746212786, "grad_norm": 0.22131845355033875, "learning_rate": 0.001, "loss": 1.8084, "step": 806600 }, { "epoch": 47.18371644148096, "grad_norm": 0.2303607314825058, "learning_rate": 0.001, "loss": 1.8058, "step": 806700 }, { "epoch": 47.189565420834064, "grad_norm": 0.157476007938385, "learning_rate": 0.001, "loss": 1.8173, "step": 806800 }, { "epoch": 47.19541440018717, "grad_norm": 0.20149920880794525, "learning_rate": 0.001, "loss": 1.8083, "step": 806900 }, { "epoch": 47.20126337954027, "grad_norm": 0.16566914319992065, "learning_rate": 0.001, "loss": 1.8089, "step": 807000 }, { "epoch": 47.207112358893376, "grad_norm": 0.22258882224559784, "learning_rate": 0.001, "loss": 1.8163, "step": 807100 }, { "epoch": 47.21296133824647, "grad_norm": 0.19628910720348358, "learning_rate": 0.001, "loss": 1.807, "step": 807200 }, { "epoch": 47.218810317599576, "grad_norm": 0.18856655061244965, "learning_rate": 0.001, "loss": 1.8122, "step": 807300 }, { "epoch": 47.22465929695268, "grad_norm": 0.1613379269838333, "learning_rate": 0.001, "loss": 1.8139, "step": 807400 }, { "epoch": 47.230508276305784, "grad_norm": 0.17655378580093384, "learning_rate": 0.001, "loss": 1.8132, "step": 807500 }, { "epoch": 47.23635725565889, "grad_norm": 0.1435765027999878, "learning_rate": 0.001, "loss": 1.8064, "step": 807600 }, { "epoch": 47.24220623501199, "grad_norm": 0.2081281542778015, "learning_rate": 0.001, "loss": 1.8101, "step": 807700 }, { "epoch": 47.248055214365095, "grad_norm": 0.18627986311912537, "learning_rate": 0.001, "loss": 1.8163, "step": 807800 }, { "epoch": 47.2539041937182, "grad_norm": 0.21631452441215515, "learning_rate": 0.001, "loss": 1.8173, "step": 807900 }, { "epoch": 47.259753173071296, "grad_norm": 0.1397199183702469, "learning_rate": 0.001, "loss": 1.8163, "step": 808000 }, { "epoch": 47.2656021524244, "grad_norm": 0.17028330266475677, "learning_rate": 0.001, "loss": 1.8075, "step": 808100 }, { "epoch": 47.2714511317775, "grad_norm": 0.19419600069522858, "learning_rate": 0.001, "loss": 1.8143, "step": 808200 }, { "epoch": 47.27730011113061, "grad_norm": 0.13639608025550842, "learning_rate": 0.001, "loss": 1.8121, "step": 808300 }, { "epoch": 47.28314909048371, "grad_norm": 0.18621036410331726, "learning_rate": 0.001, "loss": 1.8128, "step": 808400 }, { "epoch": 47.288998069836815, "grad_norm": 0.20031781494617462, "learning_rate": 0.001, "loss": 1.8123, "step": 808500 }, { "epoch": 47.29484704918992, "grad_norm": 0.17850416898727417, "learning_rate": 0.001, "loss": 1.8077, "step": 808600 }, { "epoch": 47.30069602854302, "grad_norm": 0.18492169678211212, "learning_rate": 0.001, "loss": 1.8087, "step": 808700 }, { "epoch": 47.30654500789612, "grad_norm": 0.19855549931526184, "learning_rate": 0.001, "loss": 1.8146, "step": 808800 }, { "epoch": 47.31239398724922, "grad_norm": 0.17364662885665894, "learning_rate": 0.001, "loss": 1.8186, "step": 808900 }, { "epoch": 47.31824296660233, "grad_norm": 0.12968595325946808, "learning_rate": 0.001, "loss": 1.8135, "step": 809000 }, { "epoch": 47.32409194595543, "grad_norm": 0.19588175415992737, "learning_rate": 0.001, "loss": 1.8071, "step": 809100 }, { "epoch": 47.329940925308534, "grad_norm": 0.14277508854866028, "learning_rate": 0.001, "loss": 1.8063, "step": 809200 }, { "epoch": 47.33578990466164, "grad_norm": 0.15841007232666016, "learning_rate": 0.001, "loss": 1.8093, "step": 809300 }, { "epoch": 47.34163888401474, "grad_norm": 0.16428320109844208, "learning_rate": 0.001, "loss": 1.8133, "step": 809400 }, { "epoch": 47.347487863367846, "grad_norm": 0.1772071123123169, "learning_rate": 0.001, "loss": 1.8157, "step": 809500 }, { "epoch": 47.35333684272094, "grad_norm": 0.20605860650539398, "learning_rate": 0.001, "loss": 1.8146, "step": 809600 }, { "epoch": 47.359185822074046, "grad_norm": 0.16375987231731415, "learning_rate": 0.001, "loss": 1.814, "step": 809700 }, { "epoch": 47.36503480142715, "grad_norm": 0.17678235471248627, "learning_rate": 0.001, "loss": 1.8097, "step": 809800 }, { "epoch": 47.370883780780254, "grad_norm": 0.28354647755622864, "learning_rate": 0.001, "loss": 1.8107, "step": 809900 }, { "epoch": 47.37673276013336, "grad_norm": 0.17668603360652924, "learning_rate": 0.001, "loss": 1.8151, "step": 810000 }, { "epoch": 47.38258173948646, "grad_norm": 0.13778048753738403, "learning_rate": 0.001, "loss": 1.8096, "step": 810100 }, { "epoch": 47.388430718839565, "grad_norm": 0.2309183031320572, "learning_rate": 0.001, "loss": 1.814, "step": 810200 }, { "epoch": 47.39427969819266, "grad_norm": 0.18926066160202026, "learning_rate": 0.001, "loss": 1.8111, "step": 810300 }, { "epoch": 47.400128677545766, "grad_norm": 0.17686806619167328, "learning_rate": 0.001, "loss": 1.8104, "step": 810400 }, { "epoch": 47.40597765689887, "grad_norm": 0.20249240100383759, "learning_rate": 0.001, "loss": 1.812, "step": 810500 }, { "epoch": 47.41182663625197, "grad_norm": 0.19766955077648163, "learning_rate": 0.001, "loss": 1.8161, "step": 810600 }, { "epoch": 47.41767561560508, "grad_norm": 0.17980462312698364, "learning_rate": 0.001, "loss": 1.8113, "step": 810700 }, { "epoch": 47.42352459495818, "grad_norm": 0.15536952018737793, "learning_rate": 0.001, "loss": 1.8167, "step": 810800 }, { "epoch": 47.429373574311285, "grad_norm": 0.15627072751522064, "learning_rate": 0.001, "loss": 1.8112, "step": 810900 }, { "epoch": 47.43522255366439, "grad_norm": 0.19499382376670837, "learning_rate": 0.001, "loss": 1.8139, "step": 811000 }, { "epoch": 47.441071533017485, "grad_norm": 0.15339449048042297, "learning_rate": 0.001, "loss": 1.8139, "step": 811100 }, { "epoch": 47.44692051237059, "grad_norm": 0.15136678516864777, "learning_rate": 0.001, "loss": 1.8213, "step": 811200 }, { "epoch": 47.45276949172369, "grad_norm": 0.18423965573310852, "learning_rate": 0.001, "loss": 1.818, "step": 811300 }, { "epoch": 47.4586184710768, "grad_norm": 0.21116992831230164, "learning_rate": 0.001, "loss": 1.8099, "step": 811400 }, { "epoch": 47.4644674504299, "grad_norm": 0.17987018823623657, "learning_rate": 0.001, "loss": 1.8194, "step": 811500 }, { "epoch": 47.470316429783004, "grad_norm": 0.1622767299413681, "learning_rate": 0.001, "loss": 1.8164, "step": 811600 }, { "epoch": 47.47616540913611, "grad_norm": 0.18579226732254028, "learning_rate": 0.001, "loss": 1.8185, "step": 811700 }, { "epoch": 47.48201438848921, "grad_norm": 0.17451117932796478, "learning_rate": 0.001, "loss": 1.8108, "step": 811800 }, { "epoch": 47.48786336784231, "grad_norm": 0.1476186364889145, "learning_rate": 0.001, "loss": 1.812, "step": 811900 }, { "epoch": 47.49371234719541, "grad_norm": 0.19337715208530426, "learning_rate": 0.001, "loss": 1.8105, "step": 812000 }, { "epoch": 47.499561326548516, "grad_norm": 0.2684129774570465, "learning_rate": 0.001, "loss": 1.8166, "step": 812100 }, { "epoch": 47.50541030590162, "grad_norm": 0.24721361696720123, "learning_rate": 0.001, "loss": 1.8185, "step": 812200 }, { "epoch": 47.511259285254724, "grad_norm": 0.1598937064409256, "learning_rate": 0.001, "loss": 1.8176, "step": 812300 }, { "epoch": 47.51710826460783, "grad_norm": 0.13973382115364075, "learning_rate": 0.001, "loss": 1.8197, "step": 812400 }, { "epoch": 47.52295724396093, "grad_norm": 0.2207307517528534, "learning_rate": 0.001, "loss": 1.8136, "step": 812500 }, { "epoch": 47.528806223314035, "grad_norm": 0.2145039588212967, "learning_rate": 0.001, "loss": 1.8147, "step": 812600 }, { "epoch": 47.53465520266713, "grad_norm": 0.20245669782161713, "learning_rate": 0.001, "loss": 1.8178, "step": 812700 }, { "epoch": 47.540504182020236, "grad_norm": 0.17532771825790405, "learning_rate": 0.001, "loss": 1.8174, "step": 812800 }, { "epoch": 47.54635316137334, "grad_norm": 0.21795253455638885, "learning_rate": 0.001, "loss": 1.8154, "step": 812900 }, { "epoch": 47.55220214072644, "grad_norm": 0.2133256494998932, "learning_rate": 0.001, "loss": 1.8163, "step": 813000 }, { "epoch": 47.55805112007955, "grad_norm": 0.14633941650390625, "learning_rate": 0.001, "loss": 1.8133, "step": 813100 }, { "epoch": 47.56390009943265, "grad_norm": 0.22632011771202087, "learning_rate": 0.001, "loss": 1.8178, "step": 813200 }, { "epoch": 47.569749078785755, "grad_norm": 0.16543784737586975, "learning_rate": 0.001, "loss": 1.8149, "step": 813300 }, { "epoch": 47.57559805813885, "grad_norm": 0.20197901129722595, "learning_rate": 0.001, "loss": 1.8204, "step": 813400 }, { "epoch": 47.581447037491955, "grad_norm": 0.18467286229133606, "learning_rate": 0.001, "loss": 1.8186, "step": 813500 }, { "epoch": 47.58729601684506, "grad_norm": 0.16083578765392303, "learning_rate": 0.001, "loss": 1.8172, "step": 813600 }, { "epoch": 47.59314499619816, "grad_norm": 0.1924975961446762, "learning_rate": 0.001, "loss": 1.816, "step": 813700 }, { "epoch": 47.59899397555127, "grad_norm": 0.18993805348873138, "learning_rate": 0.001, "loss": 1.8187, "step": 813800 }, { "epoch": 47.60484295490437, "grad_norm": 0.19467872381210327, "learning_rate": 0.001, "loss": 1.8192, "step": 813900 }, { "epoch": 47.610691934257474, "grad_norm": 0.15097935497760773, "learning_rate": 0.001, "loss": 1.8165, "step": 814000 }, { "epoch": 47.61654091361058, "grad_norm": 0.15828190743923187, "learning_rate": 0.001, "loss": 1.8149, "step": 814100 }, { "epoch": 47.622389892963675, "grad_norm": 0.1655072271823883, "learning_rate": 0.001, "loss": 1.8132, "step": 814200 }, { "epoch": 47.62823887231678, "grad_norm": 0.17820324003696442, "learning_rate": 0.001, "loss": 1.8142, "step": 814300 }, { "epoch": 47.63408785166988, "grad_norm": 0.16169139742851257, "learning_rate": 0.001, "loss": 1.8205, "step": 814400 }, { "epoch": 47.639936831022986, "grad_norm": 0.19360630214214325, "learning_rate": 0.001, "loss": 1.8132, "step": 814500 }, { "epoch": 47.64578581037609, "grad_norm": 0.24475806951522827, "learning_rate": 0.001, "loss": 1.8162, "step": 814600 }, { "epoch": 47.651634789729194, "grad_norm": 0.16594591736793518, "learning_rate": 0.001, "loss": 1.8165, "step": 814700 }, { "epoch": 47.6574837690823, "grad_norm": 0.20857609808444977, "learning_rate": 0.001, "loss": 1.8132, "step": 814800 }, { "epoch": 47.6633327484354, "grad_norm": 0.1506120264530182, "learning_rate": 0.001, "loss": 1.817, "step": 814900 }, { "epoch": 47.6691817277885, "grad_norm": 0.16605593264102936, "learning_rate": 0.001, "loss": 1.8138, "step": 815000 }, { "epoch": 47.6750307071416, "grad_norm": 0.16614791750907898, "learning_rate": 0.001, "loss": 1.8235, "step": 815100 }, { "epoch": 47.680879686494706, "grad_norm": 0.2263612300157547, "learning_rate": 0.001, "loss": 1.8148, "step": 815200 }, { "epoch": 47.68672866584781, "grad_norm": 0.16438673436641693, "learning_rate": 0.001, "loss": 1.8138, "step": 815300 }, { "epoch": 47.69257764520091, "grad_norm": 0.14718270301818848, "learning_rate": 0.001, "loss": 1.8129, "step": 815400 }, { "epoch": 47.69842662455402, "grad_norm": 0.17509125173091888, "learning_rate": 0.001, "loss": 1.8183, "step": 815500 }, { "epoch": 47.70427560390712, "grad_norm": 0.17208990454673767, "learning_rate": 0.001, "loss": 1.8192, "step": 815600 }, { "epoch": 47.710124583260225, "grad_norm": 0.15620057284832, "learning_rate": 0.001, "loss": 1.819, "step": 815700 }, { "epoch": 47.71597356261332, "grad_norm": 0.1572543978691101, "learning_rate": 0.001, "loss": 1.8173, "step": 815800 }, { "epoch": 47.721822541966425, "grad_norm": 0.17825107276439667, "learning_rate": 0.001, "loss": 1.819, "step": 815900 }, { "epoch": 47.72767152131953, "grad_norm": 0.15865708887577057, "learning_rate": 0.001, "loss": 1.8154, "step": 816000 }, { "epoch": 47.73352050067263, "grad_norm": 0.18437685072422028, "learning_rate": 0.001, "loss": 1.8185, "step": 816100 }, { "epoch": 47.739369480025736, "grad_norm": 0.2248290777206421, "learning_rate": 0.001, "loss": 1.8203, "step": 816200 }, { "epoch": 47.74521845937884, "grad_norm": 0.3517996668815613, "learning_rate": 0.001, "loss": 1.8179, "step": 816300 }, { "epoch": 47.751067438731944, "grad_norm": 0.15993961691856384, "learning_rate": 0.001, "loss": 1.8202, "step": 816400 }, { "epoch": 47.75691641808504, "grad_norm": 0.15192870795726776, "learning_rate": 0.001, "loss": 1.8167, "step": 816500 }, { "epoch": 47.762765397438145, "grad_norm": 0.23452989757061005, "learning_rate": 0.001, "loss": 1.8154, "step": 816600 }, { "epoch": 47.76861437679125, "grad_norm": 0.2248372584581375, "learning_rate": 0.001, "loss": 1.8196, "step": 816700 }, { "epoch": 47.77446335614435, "grad_norm": 0.2284335494041443, "learning_rate": 0.001, "loss": 1.8171, "step": 816800 }, { "epoch": 47.780312335497456, "grad_norm": 0.20274485647678375, "learning_rate": 0.001, "loss": 1.822, "step": 816900 }, { "epoch": 47.78616131485056, "grad_norm": 0.17326077818870544, "learning_rate": 0.001, "loss": 1.8186, "step": 817000 }, { "epoch": 47.79201029420366, "grad_norm": 0.17156296968460083, "learning_rate": 0.001, "loss": 1.8172, "step": 817100 }, { "epoch": 47.79785927355677, "grad_norm": 0.17666518688201904, "learning_rate": 0.001, "loss": 1.8208, "step": 817200 }, { "epoch": 47.803708252909864, "grad_norm": 0.20604844391345978, "learning_rate": 0.001, "loss": 1.8223, "step": 817300 }, { "epoch": 47.80955723226297, "grad_norm": 0.14419519901275635, "learning_rate": 0.001, "loss": 1.8124, "step": 817400 }, { "epoch": 47.81540621161607, "grad_norm": 0.13757683336734772, "learning_rate": 0.001, "loss": 1.8196, "step": 817500 }, { "epoch": 47.821255190969175, "grad_norm": 0.19733308255672455, "learning_rate": 0.001, "loss": 1.8118, "step": 817600 }, { "epoch": 47.82710417032228, "grad_norm": 0.193852499127388, "learning_rate": 0.001, "loss": 1.8198, "step": 817700 }, { "epoch": 47.83295314967538, "grad_norm": 0.2321309596300125, "learning_rate": 0.001, "loss": 1.8211, "step": 817800 }, { "epoch": 47.83880212902849, "grad_norm": 0.2081948220729828, "learning_rate": 0.001, "loss": 1.8196, "step": 817900 }, { "epoch": 47.84465110838159, "grad_norm": 0.16749289631843567, "learning_rate": 0.001, "loss": 1.8209, "step": 818000 }, { "epoch": 47.85050008773469, "grad_norm": 0.21346338093280792, "learning_rate": 0.001, "loss": 1.8172, "step": 818100 }, { "epoch": 47.85634906708779, "grad_norm": 0.18094196915626526, "learning_rate": 0.001, "loss": 1.8199, "step": 818200 }, { "epoch": 47.862198046440895, "grad_norm": 0.15385788679122925, "learning_rate": 0.001, "loss": 1.8127, "step": 818300 }, { "epoch": 47.868047025794, "grad_norm": 0.18450024724006653, "learning_rate": 0.001, "loss": 1.8123, "step": 818400 }, { "epoch": 47.8738960051471, "grad_norm": 0.226291224360466, "learning_rate": 0.001, "loss": 1.8206, "step": 818500 }, { "epoch": 47.879744984500206, "grad_norm": 0.15579518675804138, "learning_rate": 0.001, "loss": 1.8189, "step": 818600 }, { "epoch": 47.88559396385331, "grad_norm": 0.1909054070711136, "learning_rate": 0.001, "loss": 1.8219, "step": 818700 }, { "epoch": 47.891442943206414, "grad_norm": 0.15391422808170319, "learning_rate": 0.001, "loss": 1.8203, "step": 818800 }, { "epoch": 47.89729192255951, "grad_norm": 0.17214883863925934, "learning_rate": 0.001, "loss": 1.8119, "step": 818900 }, { "epoch": 47.903140901912614, "grad_norm": 0.20378561317920685, "learning_rate": 0.001, "loss": 1.8214, "step": 819000 }, { "epoch": 47.90898988126572, "grad_norm": 0.2082175314426422, "learning_rate": 0.001, "loss": 1.8207, "step": 819100 }, { "epoch": 47.91483886061882, "grad_norm": 0.17553676664829254, "learning_rate": 0.001, "loss": 1.8229, "step": 819200 }, { "epoch": 47.920687839971926, "grad_norm": 0.14484579861164093, "learning_rate": 0.001, "loss": 1.8235, "step": 819300 }, { "epoch": 47.92653681932503, "grad_norm": 0.21929457783699036, "learning_rate": 0.001, "loss": 1.826, "step": 819400 }, { "epoch": 47.93238579867813, "grad_norm": 0.16438443958759308, "learning_rate": 0.001, "loss": 1.8163, "step": 819500 }, { "epoch": 47.93823477803123, "grad_norm": 0.16356073319911957, "learning_rate": 0.001, "loss": 1.8179, "step": 819600 }, { "epoch": 47.944083757384334, "grad_norm": 0.1881217658519745, "learning_rate": 0.001, "loss": 1.8111, "step": 819700 }, { "epoch": 47.94993273673744, "grad_norm": 0.18709930777549744, "learning_rate": 0.001, "loss": 1.8217, "step": 819800 }, { "epoch": 47.95578171609054, "grad_norm": 0.20426081120967865, "learning_rate": 0.001, "loss": 1.8215, "step": 819900 }, { "epoch": 47.961630695443645, "grad_norm": 0.1577775627374649, "learning_rate": 0.001, "loss": 1.8213, "step": 820000 }, { "epoch": 47.96747967479675, "grad_norm": 0.16439591348171234, "learning_rate": 0.001, "loss": 1.8163, "step": 820100 }, { "epoch": 47.97332865414985, "grad_norm": 0.19410206377506256, "learning_rate": 0.001, "loss": 1.8175, "step": 820200 }, { "epoch": 47.97917763350296, "grad_norm": 0.1855180263519287, "learning_rate": 0.001, "loss": 1.8239, "step": 820300 }, { "epoch": 47.98502661285605, "grad_norm": 0.18078850209712982, "learning_rate": 0.001, "loss": 1.8204, "step": 820400 }, { "epoch": 47.99087559220916, "grad_norm": 0.18508058786392212, "learning_rate": 0.001, "loss": 1.8234, "step": 820500 }, { "epoch": 47.99672457156226, "grad_norm": 0.1819600611925125, "learning_rate": 0.001, "loss": 1.8221, "step": 820600 }, { "epoch": 48.002573550915365, "grad_norm": 0.17221248149871826, "learning_rate": 0.001, "loss": 1.8231, "step": 820700 }, { "epoch": 48.00842253026847, "grad_norm": 0.17481397092342377, "learning_rate": 0.001, "loss": 1.8062, "step": 820800 }, { "epoch": 48.01427150962157, "grad_norm": 0.1787734031677246, "learning_rate": 0.001, "loss": 1.8064, "step": 820900 }, { "epoch": 48.020120488974676, "grad_norm": 0.16270920634269714, "learning_rate": 0.001, "loss": 1.8021, "step": 821000 }, { "epoch": 48.02596946832778, "grad_norm": 0.2573698163032532, "learning_rate": 0.001, "loss": 1.8068, "step": 821100 }, { "epoch": 48.03181844768088, "grad_norm": 0.17308703064918518, "learning_rate": 0.001, "loss": 1.8101, "step": 821200 }, { "epoch": 48.03766742703398, "grad_norm": 0.20357796549797058, "learning_rate": 0.001, "loss": 1.8068, "step": 821300 }, { "epoch": 48.043516406387084, "grad_norm": 0.15474793314933777, "learning_rate": 0.001, "loss": 1.8072, "step": 821400 }, { "epoch": 48.04936538574019, "grad_norm": 0.17112520337104797, "learning_rate": 0.001, "loss": 1.8057, "step": 821500 }, { "epoch": 48.05521436509329, "grad_norm": 0.19159181416034698, "learning_rate": 0.001, "loss": 1.803, "step": 821600 }, { "epoch": 48.061063344446396, "grad_norm": 0.1709475964307785, "learning_rate": 0.001, "loss": 1.8036, "step": 821700 }, { "epoch": 48.0669123237995, "grad_norm": 0.16681043803691864, "learning_rate": 0.001, "loss": 1.8047, "step": 821800 }, { "epoch": 48.0727613031526, "grad_norm": 0.2104228287935257, "learning_rate": 0.001, "loss": 1.8004, "step": 821900 }, { "epoch": 48.0786102825057, "grad_norm": 0.28157174587249756, "learning_rate": 0.001, "loss": 1.808, "step": 822000 }, { "epoch": 48.084459261858804, "grad_norm": 0.1743306666612625, "learning_rate": 0.001, "loss": 1.807, "step": 822100 }, { "epoch": 48.09030824121191, "grad_norm": 0.22410953044891357, "learning_rate": 0.001, "loss": 1.8026, "step": 822200 }, { "epoch": 48.09615722056501, "grad_norm": 0.22414720058441162, "learning_rate": 0.001, "loss": 1.8045, "step": 822300 }, { "epoch": 48.102006199918115, "grad_norm": 0.15827056765556335, "learning_rate": 0.001, "loss": 1.8083, "step": 822400 }, { "epoch": 48.10785517927122, "grad_norm": 0.19752220809459686, "learning_rate": 0.001, "loss": 1.8135, "step": 822500 }, { "epoch": 48.11370415862432, "grad_norm": 0.20458436012268066, "learning_rate": 0.001, "loss": 1.8059, "step": 822600 }, { "epoch": 48.11955313797742, "grad_norm": 0.164154052734375, "learning_rate": 0.001, "loss": 1.8054, "step": 822700 }, { "epoch": 48.12540211733052, "grad_norm": 0.20363646745681763, "learning_rate": 0.001, "loss": 1.8086, "step": 822800 }, { "epoch": 48.13125109668363, "grad_norm": 0.18673661351203918, "learning_rate": 0.001, "loss": 1.8081, "step": 822900 }, { "epoch": 48.13710007603673, "grad_norm": 0.18358497321605682, "learning_rate": 0.001, "loss": 1.8056, "step": 823000 }, { "epoch": 48.142949055389835, "grad_norm": 0.15535755455493927, "learning_rate": 0.001, "loss": 1.8054, "step": 823100 }, { "epoch": 48.14879803474294, "grad_norm": 0.180524080991745, "learning_rate": 0.001, "loss": 1.8093, "step": 823200 }, { "epoch": 48.15464701409604, "grad_norm": 0.23098301887512207, "learning_rate": 0.001, "loss": 1.8125, "step": 823300 }, { "epoch": 48.160495993449146, "grad_norm": 0.16814099252223969, "learning_rate": 0.001, "loss": 1.8091, "step": 823400 }, { "epoch": 48.16634497280224, "grad_norm": 0.2652949094772339, "learning_rate": 0.001, "loss": 1.8136, "step": 823500 }, { "epoch": 48.17219395215535, "grad_norm": 0.20490334928035736, "learning_rate": 0.001, "loss": 1.808, "step": 823600 }, { "epoch": 48.17804293150845, "grad_norm": 0.15057282149791718, "learning_rate": 0.001, "loss": 1.8075, "step": 823700 }, { "epoch": 48.183891910861554, "grad_norm": 0.14460617303848267, "learning_rate": 0.001, "loss": 1.8081, "step": 823800 }, { "epoch": 48.18974089021466, "grad_norm": 0.18623536825180054, "learning_rate": 0.001, "loss": 1.8173, "step": 823900 }, { "epoch": 48.19558986956776, "grad_norm": 0.1488414853811264, "learning_rate": 0.001, "loss": 1.8105, "step": 824000 }, { "epoch": 48.201438848920866, "grad_norm": 0.18252088129520416, "learning_rate": 0.001, "loss": 1.8109, "step": 824100 }, { "epoch": 48.20728782827397, "grad_norm": 0.21994706988334656, "learning_rate": 0.001, "loss": 1.8084, "step": 824200 }, { "epoch": 48.213136807627066, "grad_norm": 0.17923173308372498, "learning_rate": 0.001, "loss": 1.8081, "step": 824300 }, { "epoch": 48.21898578698017, "grad_norm": 0.16911153495311737, "learning_rate": 0.001, "loss": 1.8057, "step": 824400 }, { "epoch": 48.224834766333274, "grad_norm": 0.2481652945280075, "learning_rate": 0.001, "loss": 1.8093, "step": 824500 }, { "epoch": 48.23068374568638, "grad_norm": 0.19666460156440735, "learning_rate": 0.001, "loss": 1.8104, "step": 824600 }, { "epoch": 48.23653272503948, "grad_norm": 0.21080072224140167, "learning_rate": 0.001, "loss": 1.8135, "step": 824700 }, { "epoch": 48.242381704392585, "grad_norm": 0.18418142199516296, "learning_rate": 0.001, "loss": 1.8136, "step": 824800 }, { "epoch": 48.24823068374569, "grad_norm": 0.1637101024389267, "learning_rate": 0.001, "loss": 1.814, "step": 824900 }, { "epoch": 48.25407966309879, "grad_norm": 0.17831259965896606, "learning_rate": 0.001, "loss": 1.812, "step": 825000 }, { "epoch": 48.25407966309879, "eval_ag_news_accuracy": 0.24140625, "eval_ag_news_bleu_score": 8.73604151038144, "eval_ag_news_bleu_score_sem": 0.62075151111198, "eval_ag_news_emb_cos_sim": 0.7261211276054382, "eval_ag_news_emb_cos_sim_sem": 0.014194557443261147, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6917574405670166, "eval_ag_news_n_ngrams_match_1": 14.9375, "eval_ag_news_n_ngrams_match_2": 5.0546875, "eval_ag_news_n_ngrams_match_3": 2.15625, "eval_ag_news_num_pred_words": 43.1171875, "eval_ag_news_num_true_words": 44.453125, "eval_ag_news_perplexity": 14.757588728766056, "eval_ag_news_pred_num_tokens": 65.390625, "eval_ag_news_rouge_score": 0.32505663434180126, "eval_ag_news_runtime": 37.8952, "eval_ag_news_samples_per_second": 13.194, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.35803184081541467, "eval_ag_news_token_set_f1_sem": 0.01084853156085969, "eval_ag_news_token_set_precision": 0.3368446842597889, "eval_ag_news_token_set_recall": 0.3908239241072775, "eval_ag_news_true_num_tokens": 62.0859375, "step": 825000 }, { "epoch": 48.25407966309879, "eval_anthropic_toxic_prompts_accuracy": 0.102578125, "eval_anthropic_toxic_prompts_bleu_score": 41.564677311858695, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.6178739988699995, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9015083312988281, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008918950334191322, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.140625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030847556262404395, "eval_anthropic_toxic_prompts_loss": 1.277453899383545, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.15625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.453125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.3984375, "eval_anthropic_toxic_prompts_num_pred_words": 14.5390625, "eval_anthropic_toxic_prompts_num_true_words": 14.78125, "eval_anthropic_toxic_prompts_perplexity": 3.58749396689845, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.2421875, "eval_anthropic_toxic_prompts_rouge_score": 0.6909024763625702, "eval_anthropic_toxic_prompts_runtime": 31.2353, "eval_anthropic_toxic_prompts_samples_per_second": 16.008, "eval_anthropic_toxic_prompts_steps_per_second": 0.032, "eval_anthropic_toxic_prompts_token_set_f1": 0.7048651743479208, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017793523716862568, "eval_anthropic_toxic_prompts_token_set_precision": 0.6995266628166126, "eval_anthropic_toxic_prompts_token_set_recall": 0.7138798436690252, "eval_anthropic_toxic_prompts_true_num_tokens": 17.890625, "step": 825000 }, { "epoch": 48.25407966309879, "eval_arxiv_accuracy": 0.373921875, "eval_arxiv_bleu_score": 1.7587919600640909, "eval_arxiv_bleu_score_sem": 0.14823857333633716, "eval_arxiv_emb_cos_sim": 0.4851681590080261, "eval_arxiv_emb_cos_sim_sem": 0.018860984593629837, "eval_arxiv_emb_top1_equal": 0.859375, "eval_arxiv_emb_top1_equal_sem": 0.03084755875170231, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4328300952911377, "eval_arxiv_n_ngrams_match_1": 13.921875, "eval_arxiv_n_ngrams_match_2": 2.5703125, "eval_arxiv_n_ngrams_match_3": 0.5, "eval_arxiv_num_pred_words": 54.734375, "eval_arxiv_num_true_words": 86.1171875, "eval_arxiv_perplexity": 30.96415035995971, "eval_arxiv_pred_num_tokens": 125.578125, "eval_arxiv_rouge_score": 0.1836262755108314, "eval_arxiv_runtime": 32.0437, "eval_arxiv_samples_per_second": 15.604, "eval_arxiv_steps_per_second": 0.031, "eval_arxiv_token_set_f1": 0.1849863641209683, "eval_arxiv_token_set_f1_sem": 0.008493926558151042, "eval_arxiv_token_set_precision": 0.12638103473463955, "eval_arxiv_token_set_recall": 0.41596412502734065, "eval_arxiv_true_num_tokens": 125.4609375, "step": 825000 }, { "epoch": 48.25407966309879, "eval_python_code_alpaca_accuracy": 0.12803125, "eval_python_code_alpaca_bleu_score": 26.64191162993204, "eval_python_code_alpaca_bleu_score_sem": 1.661117779938506, "eval_python_code_alpaca_emb_cos_sim": 0.8616484999656677, "eval_python_code_alpaca_emb_cos_sim_sem": 0.011697625741362572, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4821032285690308, "eval_python_code_alpaca_n_ngrams_match_1": 10.78125, "eval_python_code_alpaca_n_ngrams_match_2": 5.640625, "eval_python_code_alpaca_n_ngrams_match_3": 3.109375, "eval_python_code_alpaca_num_pred_words": 18.1015625, "eval_python_code_alpaca_num_true_words": 19.9296875, "eval_python_code_alpaca_perplexity": 4.402194772844753, "eval_python_code_alpaca_pred_num_tokens": 24.515625, "eval_python_code_alpaca_rouge_score": 0.5987067978862782, "eval_python_code_alpaca_runtime": 32.0435, "eval_python_code_alpaca_samples_per_second": 15.604, "eval_python_code_alpaca_steps_per_second": 0.031, "eval_python_code_alpaca_token_set_f1": 0.6191243718572609, "eval_python_code_alpaca_token_set_f1_sem": 0.013882560133235898, "eval_python_code_alpaca_token_set_precision": 0.5998934707340406, "eval_python_code_alpaca_token_set_recall": 0.6472501694233185, "eval_python_code_alpaca_true_num_tokens": 25.75, "step": 825000 }, { "epoch": 48.25407966309879, "eval_wikibio_accuracy": 0.37034375, "eval_wikibio_bleu_score": 8.424089492424905, "eval_wikibio_bleu_score_sem": 0.7196242134697559, "eval_wikibio_emb_cos_sim": 0.6310256123542786, "eval_wikibio_emb_cos_sim_sem": 0.020894408226013184, "eval_wikibio_emb_top1_equal": 0.9140625, "eval_wikibio_emb_top1_equal_sem": 0.024870097637176514, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.721205472946167, "eval_wikibio_n_ngrams_match_1": 17.0625, "eval_wikibio_n_ngrams_match_2": 5.984375, "eval_wikibio_n_ngrams_match_3": 2.625, "eval_wikibio_num_pred_words": 55.1171875, "eval_wikibio_num_true_words": 55.1328125, "eval_wikibio_perplexity": 15.198632746924314, "eval_wikibio_pred_num_tokens": 106.546875, "eval_wikibio_rouge_score": 0.31341020416334686, "eval_wikibio_runtime": 32.7942, "eval_wikibio_samples_per_second": 15.247, "eval_wikibio_steps_per_second": 0.03, "eval_wikibio_token_set_f1": 0.3351063846779422, "eval_wikibio_token_set_f1_sem": 0.011119758352527582, "eval_wikibio_token_set_precision": 0.3032774586859039, "eval_wikibio_token_set_recall": 0.4114339997468509, "eval_wikibio_true_num_tokens": 104.1171875, "step": 825000 }, { "epoch": 48.25407966309879, "eval_msmarco_accuracy": 0.393625, "eval_msmarco_bleu_score": 15.479771489890783, "eval_msmarco_bleu_score_sem": 1.1309457862692491, "eval_msmarco_emb_cos_sim": 0.785022497177124, "eval_msmarco_emb_cos_sim_sem": 0.016671592369675636, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.711525321006775, "eval_msmarco_n_ngrams_match_1": 27.203125, "eval_msmarco_n_ngrams_match_2": 11.734375, "eval_msmarco_n_ngrams_match_3": 6.3984375, "eval_msmarco_num_pred_words": 60.5234375, "eval_msmarco_num_true_words": 62.6484375, "eval_msmarco_perplexity": 5.537401353835359, "eval_msmarco_pred_num_tokens": 84.734375, "eval_msmarco_rouge_score": 0.42401749143795403, "eval_msmarco_runtime": 27.0156, "eval_msmarco_samples_per_second": 18.508, "eval_msmarco_steps_per_second": 0.037, "eval_msmarco_token_set_f1": 0.44715454507236435, "eval_msmarco_token_set_f1_sem": 0.013169519861005946, "eval_msmarco_token_set_precision": 0.4094691873291362, "eval_msmarco_token_set_recall": 0.5160645280135974, "eval_msmarco_true_num_tokens": 81.0625, "step": 825000 }, { "epoch": 48.25992864245189, "grad_norm": 0.22060361504554749, "learning_rate": 0.001, "loss": 1.8104, "step": 825100 }, { "epoch": 48.26577762180499, "grad_norm": 0.1489727646112442, "learning_rate": 0.001, "loss": 1.8098, "step": 825200 }, { "epoch": 48.2716266011581, "grad_norm": 0.15106835961341858, "learning_rate": 0.001, "loss": 1.8084, "step": 825300 }, { "epoch": 48.2774755805112, "grad_norm": 0.1717412769794464, "learning_rate": 0.001, "loss": 1.8067, "step": 825400 }, { "epoch": 48.283324559864305, "grad_norm": 0.1605033129453659, "learning_rate": 0.001, "loss": 1.8141, "step": 825500 }, { "epoch": 48.28917353921741, "grad_norm": 0.1786879450082779, "learning_rate": 0.001, "loss": 1.8151, "step": 825600 }, { "epoch": 48.29502251857051, "grad_norm": 0.181325301527977, "learning_rate": 0.001, "loss": 1.8138, "step": 825700 }, { "epoch": 48.30087149792361, "grad_norm": 0.16115742921829224, "learning_rate": 0.001, "loss": 1.8104, "step": 825800 }, { "epoch": 48.30672047727671, "grad_norm": 0.15389393270015717, "learning_rate": 0.001, "loss": 1.8058, "step": 825900 }, { "epoch": 48.31256945662982, "grad_norm": 0.17175161838531494, "learning_rate": 0.001, "loss": 1.81, "step": 826000 }, { "epoch": 48.31841843598292, "grad_norm": 0.1967678964138031, "learning_rate": 0.001, "loss": 1.8134, "step": 826100 }, { "epoch": 48.324267415336024, "grad_norm": 0.1692403256893158, "learning_rate": 0.001, "loss": 1.8122, "step": 826200 }, { "epoch": 48.33011639468913, "grad_norm": 0.1879119873046875, "learning_rate": 0.001, "loss": 1.8134, "step": 826300 }, { "epoch": 48.33596537404223, "grad_norm": 0.2072838842868805, "learning_rate": 0.001, "loss": 1.8172, "step": 826400 }, { "epoch": 48.341814353395336, "grad_norm": 0.19173522293567657, "learning_rate": 0.001, "loss": 1.8107, "step": 826500 }, { "epoch": 48.34766333274843, "grad_norm": 0.18492373824119568, "learning_rate": 0.001, "loss": 1.8126, "step": 826600 }, { "epoch": 48.353512312101536, "grad_norm": 0.19680717587471008, "learning_rate": 0.001, "loss": 1.8103, "step": 826700 }, { "epoch": 48.35936129145464, "grad_norm": 0.1857667863368988, "learning_rate": 0.001, "loss": 1.8124, "step": 826800 }, { "epoch": 48.365210270807744, "grad_norm": 0.17717361450195312, "learning_rate": 0.001, "loss": 1.8091, "step": 826900 }, { "epoch": 48.37105925016085, "grad_norm": 0.2132592797279358, "learning_rate": 0.001, "loss": 1.8102, "step": 827000 }, { "epoch": 48.37690822951395, "grad_norm": 0.16376666724681854, "learning_rate": 0.001, "loss": 1.8129, "step": 827100 }, { "epoch": 48.382757208867055, "grad_norm": 0.22257393598556519, "learning_rate": 0.001, "loss": 1.8109, "step": 827200 }, { "epoch": 48.38860618822016, "grad_norm": 0.23614820837974548, "learning_rate": 0.001, "loss": 1.8139, "step": 827300 }, { "epoch": 48.394455167573255, "grad_norm": 0.18515644967556, "learning_rate": 0.001, "loss": 1.8144, "step": 827400 }, { "epoch": 48.40030414692636, "grad_norm": 0.17436175048351288, "learning_rate": 0.001, "loss": 1.8064, "step": 827500 }, { "epoch": 48.40615312627946, "grad_norm": 0.19733963906764984, "learning_rate": 0.001, "loss": 1.8128, "step": 827600 }, { "epoch": 48.41200210563257, "grad_norm": 0.14531448483467102, "learning_rate": 0.001, "loss": 1.8136, "step": 827700 }, { "epoch": 48.41785108498567, "grad_norm": 0.17687112092971802, "learning_rate": 0.001, "loss": 1.8175, "step": 827800 }, { "epoch": 48.423700064338774, "grad_norm": 0.21461938321590424, "learning_rate": 0.001, "loss": 1.8154, "step": 827900 }, { "epoch": 48.42954904369188, "grad_norm": 0.22170697152614594, "learning_rate": 0.001, "loss": 1.8158, "step": 828000 }, { "epoch": 48.43539802304498, "grad_norm": 0.176935076713562, "learning_rate": 0.001, "loss": 1.8146, "step": 828100 }, { "epoch": 48.44124700239808, "grad_norm": 0.136895552277565, "learning_rate": 0.001, "loss": 1.8089, "step": 828200 }, { "epoch": 48.44709598175118, "grad_norm": 0.20701614022254944, "learning_rate": 0.001, "loss": 1.8105, "step": 828300 }, { "epoch": 48.452944961104286, "grad_norm": 0.1953405886888504, "learning_rate": 0.001, "loss": 1.8091, "step": 828400 }, { "epoch": 48.45879394045739, "grad_norm": 0.15021443367004395, "learning_rate": 0.001, "loss": 1.8072, "step": 828500 }, { "epoch": 48.464642919810494, "grad_norm": 0.19697268307209015, "learning_rate": 0.001, "loss": 1.8106, "step": 828600 }, { "epoch": 48.4704918991636, "grad_norm": 0.16993831098079681, "learning_rate": 0.001, "loss": 1.8136, "step": 828700 }, { "epoch": 48.4763408785167, "grad_norm": 0.20263715088367462, "learning_rate": 0.001, "loss": 1.8129, "step": 828800 }, { "epoch": 48.4821898578698, "grad_norm": 0.23691874742507935, "learning_rate": 0.001, "loss": 1.8082, "step": 828900 }, { "epoch": 48.4880388372229, "grad_norm": 0.16485758125782013, "learning_rate": 0.001, "loss": 1.8172, "step": 829000 }, { "epoch": 48.493887816576006, "grad_norm": 0.17540229856967926, "learning_rate": 0.001, "loss": 1.8093, "step": 829100 }, { "epoch": 48.49973679592911, "grad_norm": 0.17268279194831848, "learning_rate": 0.001, "loss": 1.8119, "step": 829200 }, { "epoch": 48.50558577528221, "grad_norm": 0.18195849657058716, "learning_rate": 0.001, "loss": 1.814, "step": 829300 }, { "epoch": 48.51143475463532, "grad_norm": 0.17379137873649597, "learning_rate": 0.001, "loss": 1.8167, "step": 829400 }, { "epoch": 48.51728373398842, "grad_norm": 0.20649655163288116, "learning_rate": 0.001, "loss": 1.8213, "step": 829500 }, { "epoch": 48.523132713341525, "grad_norm": 0.23547038435935974, "learning_rate": 0.001, "loss": 1.8128, "step": 829600 }, { "epoch": 48.52898169269462, "grad_norm": 0.1915908306837082, "learning_rate": 0.001, "loss": 1.8179, "step": 829700 }, { "epoch": 48.534830672047725, "grad_norm": 0.2475721389055252, "learning_rate": 0.001, "loss": 1.813, "step": 829800 }, { "epoch": 48.54067965140083, "grad_norm": 0.21132202446460724, "learning_rate": 0.001, "loss": 1.811, "step": 829900 }, { "epoch": 48.54652863075393, "grad_norm": 0.17668136954307556, "learning_rate": 0.001, "loss": 1.8151, "step": 830000 }, { "epoch": 48.55237761010704, "grad_norm": 0.15249426662921906, "learning_rate": 0.001, "loss": 1.821, "step": 830100 }, { "epoch": 48.55822658946014, "grad_norm": 0.18308162689208984, "learning_rate": 0.001, "loss": 1.8174, "step": 830200 }, { "epoch": 48.564075568813244, "grad_norm": 0.17969442903995514, "learning_rate": 0.001, "loss": 1.8123, "step": 830300 }, { "epoch": 48.56992454816635, "grad_norm": 0.19403797388076782, "learning_rate": 0.001, "loss": 1.8118, "step": 830400 }, { "epoch": 48.575773527519445, "grad_norm": 0.18779060244560242, "learning_rate": 0.001, "loss": 1.813, "step": 830500 }, { "epoch": 48.58162250687255, "grad_norm": 0.26673048734664917, "learning_rate": 0.001, "loss": 1.8138, "step": 830600 }, { "epoch": 48.58747148622565, "grad_norm": 0.1797509640455246, "learning_rate": 0.001, "loss": 1.8165, "step": 830700 }, { "epoch": 48.593320465578756, "grad_norm": 0.15724138915538788, "learning_rate": 0.001, "loss": 1.8147, "step": 830800 }, { "epoch": 48.59916944493186, "grad_norm": 0.1718803346157074, "learning_rate": 0.001, "loss": 1.8142, "step": 830900 }, { "epoch": 48.605018424284964, "grad_norm": 0.19226454198360443, "learning_rate": 0.001, "loss": 1.8157, "step": 831000 }, { "epoch": 48.61086740363807, "grad_norm": 0.1804172843694687, "learning_rate": 0.001, "loss": 1.821, "step": 831100 }, { "epoch": 48.61671638299117, "grad_norm": 0.14756172895431519, "learning_rate": 0.001, "loss": 1.8143, "step": 831200 }, { "epoch": 48.62256536234427, "grad_norm": 0.1644430160522461, "learning_rate": 0.001, "loss": 1.8135, "step": 831300 }, { "epoch": 48.62841434169737, "grad_norm": 0.18034224212169647, "learning_rate": 0.001, "loss": 1.8202, "step": 831400 }, { "epoch": 48.634263321050476, "grad_norm": 0.17102208733558655, "learning_rate": 0.001, "loss": 1.8138, "step": 831500 }, { "epoch": 48.64011230040358, "grad_norm": 0.17156806588172913, "learning_rate": 0.001, "loss": 1.8124, "step": 831600 }, { "epoch": 48.64596127975668, "grad_norm": 0.1980394870042801, "learning_rate": 0.001, "loss": 1.8089, "step": 831700 }, { "epoch": 48.65181025910979, "grad_norm": 0.18468590080738068, "learning_rate": 0.001, "loss": 1.8172, "step": 831800 }, { "epoch": 48.65765923846289, "grad_norm": 0.1909821480512619, "learning_rate": 0.001, "loss": 1.8181, "step": 831900 }, { "epoch": 48.66350821781599, "grad_norm": 0.16107217967510223, "learning_rate": 0.001, "loss": 1.8125, "step": 832000 }, { "epoch": 48.66935719716909, "grad_norm": 0.19437824189662933, "learning_rate": 0.001, "loss": 1.8099, "step": 832100 }, { "epoch": 48.675206176522195, "grad_norm": 0.3132174611091614, "learning_rate": 0.001, "loss": 1.8212, "step": 832200 }, { "epoch": 48.6810551558753, "grad_norm": 0.14809033274650574, "learning_rate": 0.001, "loss": 1.8184, "step": 832300 }, { "epoch": 48.6869041352284, "grad_norm": 0.14904950559139252, "learning_rate": 0.001, "loss": 1.817, "step": 832400 }, { "epoch": 48.69275311458151, "grad_norm": 0.15669775009155273, "learning_rate": 0.001, "loss": 1.8114, "step": 832500 }, { "epoch": 48.69860209393461, "grad_norm": 0.1740139275789261, "learning_rate": 0.001, "loss": 1.8111, "step": 832600 }, { "epoch": 48.704451073287714, "grad_norm": 0.1398666799068451, "learning_rate": 0.001, "loss": 1.8162, "step": 832700 }, { "epoch": 48.71030005264081, "grad_norm": 0.186379075050354, "learning_rate": 0.001, "loss": 1.811, "step": 832800 }, { "epoch": 48.716149031993915, "grad_norm": 0.1855565905570984, "learning_rate": 0.001, "loss": 1.8174, "step": 832900 }, { "epoch": 48.72199801134702, "grad_norm": 0.27136504650115967, "learning_rate": 0.001, "loss": 1.815, "step": 833000 }, { "epoch": 48.72784699070012, "grad_norm": 0.24859942495822906, "learning_rate": 0.001, "loss": 1.8168, "step": 833100 }, { "epoch": 48.733695970053226, "grad_norm": 0.17584334313869476, "learning_rate": 0.001, "loss": 1.8112, "step": 833200 }, { "epoch": 48.73954494940633, "grad_norm": 0.32169389724731445, "learning_rate": 0.001, "loss": 1.8178, "step": 833300 }, { "epoch": 48.745393928759434, "grad_norm": 0.16574977338314056, "learning_rate": 0.001, "loss": 1.8129, "step": 833400 }, { "epoch": 48.75124290811254, "grad_norm": 0.16043250262737274, "learning_rate": 0.001, "loss": 1.8134, "step": 833500 }, { "epoch": 48.757091887465634, "grad_norm": 0.193297877907753, "learning_rate": 0.001, "loss": 1.8168, "step": 833600 }, { "epoch": 48.76294086681874, "grad_norm": 0.21557734906673431, "learning_rate": 0.001, "loss": 1.819, "step": 833700 }, { "epoch": 48.76878984617184, "grad_norm": 0.2014492303133011, "learning_rate": 0.001, "loss": 1.8167, "step": 833800 }, { "epoch": 48.774638825524946, "grad_norm": 0.18985797464847565, "learning_rate": 0.001, "loss": 1.811, "step": 833900 }, { "epoch": 48.78048780487805, "grad_norm": 0.1806378960609436, "learning_rate": 0.001, "loss": 1.8114, "step": 834000 }, { "epoch": 48.78633678423115, "grad_norm": 0.18109577894210815, "learning_rate": 0.001, "loss": 1.8131, "step": 834100 }, { "epoch": 48.79218576358426, "grad_norm": 0.18063676357269287, "learning_rate": 0.001, "loss": 1.813, "step": 834200 }, { "epoch": 48.79803474293736, "grad_norm": 0.21134088933467865, "learning_rate": 0.001, "loss": 1.8198, "step": 834300 }, { "epoch": 48.80388372229046, "grad_norm": 0.16455121338367462, "learning_rate": 0.001, "loss": 1.8156, "step": 834400 }, { "epoch": 48.80973270164356, "grad_norm": 0.16082240641117096, "learning_rate": 0.001, "loss": 1.8129, "step": 834500 }, { "epoch": 48.815581680996665, "grad_norm": 0.16492323577404022, "learning_rate": 0.001, "loss": 1.8187, "step": 834600 }, { "epoch": 48.82143066034977, "grad_norm": 0.17578494548797607, "learning_rate": 0.001, "loss": 1.8192, "step": 834700 }, { "epoch": 48.82727963970287, "grad_norm": 0.16954605281352997, "learning_rate": 0.001, "loss": 1.8195, "step": 834800 }, { "epoch": 48.83312861905598, "grad_norm": 0.1733979731798172, "learning_rate": 0.001, "loss": 1.8211, "step": 834900 }, { "epoch": 48.83897759840908, "grad_norm": 0.19680848717689514, "learning_rate": 0.001, "loss": 1.8186, "step": 835000 }, { "epoch": 48.84482657776218, "grad_norm": 0.1524832844734192, "learning_rate": 0.001, "loss": 1.815, "step": 835100 }, { "epoch": 48.85067555711528, "grad_norm": 0.16273090243339539, "learning_rate": 0.001, "loss": 1.8129, "step": 835200 }, { "epoch": 48.856524536468385, "grad_norm": 0.18339920043945312, "learning_rate": 0.001, "loss": 1.8235, "step": 835300 }, { "epoch": 48.86237351582149, "grad_norm": 0.18876084685325623, "learning_rate": 0.001, "loss": 1.8204, "step": 835400 }, { "epoch": 48.86822249517459, "grad_norm": 0.14303486049175262, "learning_rate": 0.001, "loss": 1.8127, "step": 835500 }, { "epoch": 48.874071474527696, "grad_norm": 0.14112147688865662, "learning_rate": 0.001, "loss": 1.8175, "step": 835600 }, { "epoch": 48.8799204538808, "grad_norm": 0.17427843809127808, "learning_rate": 0.001, "loss": 1.8169, "step": 835700 }, { "epoch": 48.885769433233904, "grad_norm": 0.18354614078998566, "learning_rate": 0.001, "loss": 1.8202, "step": 835800 }, { "epoch": 48.891618412587, "grad_norm": 0.1895984262228012, "learning_rate": 0.001, "loss": 1.8155, "step": 835900 }, { "epoch": 48.897467391940104, "grad_norm": 0.14340005815029144, "learning_rate": 0.001, "loss": 1.8231, "step": 836000 }, { "epoch": 48.90331637129321, "grad_norm": 0.15354420244693756, "learning_rate": 0.001, "loss": 1.8193, "step": 836100 }, { "epoch": 48.90916535064631, "grad_norm": 0.16076329350471497, "learning_rate": 0.001, "loss": 1.8143, "step": 836200 }, { "epoch": 48.915014329999416, "grad_norm": 0.22416509687900543, "learning_rate": 0.001, "loss": 1.8156, "step": 836300 }, { "epoch": 48.92086330935252, "grad_norm": 0.21494489908218384, "learning_rate": 0.001, "loss": 1.8177, "step": 836400 }, { "epoch": 48.92671228870562, "grad_norm": 0.16571775078773499, "learning_rate": 0.001, "loss": 1.8186, "step": 836500 }, { "epoch": 48.93256126805873, "grad_norm": 0.16250832378864288, "learning_rate": 0.001, "loss": 1.8193, "step": 836600 }, { "epoch": 48.938410247411824, "grad_norm": 0.1790037453174591, "learning_rate": 0.001, "loss": 1.8161, "step": 836700 }, { "epoch": 48.94425922676493, "grad_norm": 0.23659324645996094, "learning_rate": 0.001, "loss": 1.8124, "step": 836800 }, { "epoch": 48.95010820611803, "grad_norm": 0.20387518405914307, "learning_rate": 0.001, "loss": 1.8202, "step": 836900 }, { "epoch": 48.955957185471135, "grad_norm": 0.2201024293899536, "learning_rate": 0.001, "loss": 1.8187, "step": 837000 }, { "epoch": 48.96180616482424, "grad_norm": 0.16101746261119843, "learning_rate": 0.001, "loss": 1.8202, "step": 837100 }, { "epoch": 48.96765514417734, "grad_norm": 0.1987527757883072, "learning_rate": 0.001, "loss": 1.8138, "step": 837200 }, { "epoch": 48.97350412353045, "grad_norm": 0.2107602059841156, "learning_rate": 0.001, "loss": 1.8136, "step": 837300 }, { "epoch": 48.97935310288355, "grad_norm": 0.1679507941007614, "learning_rate": 0.001, "loss": 1.8184, "step": 837400 }, { "epoch": 48.98520208223665, "grad_norm": 0.17406655848026276, "learning_rate": 0.001, "loss": 1.8147, "step": 837500 }, { "epoch": 48.99105106158975, "grad_norm": 0.20360784232616425, "learning_rate": 0.001, "loss": 1.8209, "step": 837600 }, { "epoch": 48.996900040942855, "grad_norm": 0.18049581348896027, "learning_rate": 0.001, "loss": 1.8193, "step": 837700 }, { "epoch": 49.00274902029596, "grad_norm": 0.19253715872764587, "learning_rate": 0.001, "loss": 1.8038, "step": 837800 }, { "epoch": 49.00859799964906, "grad_norm": 0.18790936470031738, "learning_rate": 0.001, "loss": 1.8019, "step": 837900 }, { "epoch": 49.014446979002166, "grad_norm": 0.14165721833705902, "learning_rate": 0.001, "loss": 1.8011, "step": 838000 }, { "epoch": 49.02029595835527, "grad_norm": 0.18738731741905212, "learning_rate": 0.001, "loss": 1.7991, "step": 838100 }, { "epoch": 49.02614493770837, "grad_norm": 0.17697826027870178, "learning_rate": 0.001, "loss": 1.7979, "step": 838200 }, { "epoch": 49.03199391706147, "grad_norm": 0.1955273449420929, "learning_rate": 0.001, "loss": 1.7984, "step": 838300 }, { "epoch": 49.037842896414574, "grad_norm": 0.1880578249692917, "learning_rate": 0.001, "loss": 1.8005, "step": 838400 }, { "epoch": 49.04369187576768, "grad_norm": 0.20348984003067017, "learning_rate": 0.001, "loss": 1.801, "step": 838500 }, { "epoch": 49.04954085512078, "grad_norm": 0.21538065373897552, "learning_rate": 0.001, "loss": 1.7961, "step": 838600 }, { "epoch": 49.055389834473885, "grad_norm": 0.2126924991607666, "learning_rate": 0.001, "loss": 1.8031, "step": 838700 }, { "epoch": 49.06123881382699, "grad_norm": 0.21044249832630157, "learning_rate": 0.001, "loss": 1.8052, "step": 838800 }, { "epoch": 49.06708779318009, "grad_norm": 0.21916243433952332, "learning_rate": 0.001, "loss": 1.8066, "step": 838900 }, { "epoch": 49.07293677253319, "grad_norm": 0.17511039972305298, "learning_rate": 0.001, "loss": 1.8044, "step": 839000 }, { "epoch": 49.078785751886294, "grad_norm": 0.1876412332057953, "learning_rate": 0.001, "loss": 1.802, "step": 839100 }, { "epoch": 49.0846347312394, "grad_norm": 0.15734465420246124, "learning_rate": 0.001, "loss": 1.8056, "step": 839200 }, { "epoch": 49.0904837105925, "grad_norm": 0.20230840146541595, "learning_rate": 0.001, "loss": 1.8055, "step": 839300 }, { "epoch": 49.096332689945605, "grad_norm": 0.21222738921642303, "learning_rate": 0.001, "loss": 1.798, "step": 839400 }, { "epoch": 49.10218166929871, "grad_norm": 0.21754522621631622, "learning_rate": 0.001, "loss": 1.8127, "step": 839500 }, { "epoch": 49.10803064865181, "grad_norm": 0.20226801931858063, "learning_rate": 0.001, "loss": 1.8113, "step": 839600 }, { "epoch": 49.113879628004916, "grad_norm": 0.14622069895267487, "learning_rate": 0.001, "loss": 1.8023, "step": 839700 }, { "epoch": 49.11972860735801, "grad_norm": 0.15914598107337952, "learning_rate": 0.001, "loss": 1.8095, "step": 839800 }, { "epoch": 49.12557758671112, "grad_norm": 0.2767196297645569, "learning_rate": 0.001, "loss": 1.805, "step": 839900 }, { "epoch": 49.13142656606422, "grad_norm": 0.18035826086997986, "learning_rate": 0.001, "loss": 1.8029, "step": 840000 }, { "epoch": 49.137275545417324, "grad_norm": 0.1754802167415619, "learning_rate": 0.001, "loss": 1.8037, "step": 840100 }, { "epoch": 49.14312452477043, "grad_norm": 0.18962711095809937, "learning_rate": 0.001, "loss": 1.8107, "step": 840200 }, { "epoch": 49.14897350412353, "grad_norm": 0.24835216999053955, "learning_rate": 0.001, "loss": 1.8093, "step": 840300 }, { "epoch": 49.154822483476636, "grad_norm": 0.23128513991832733, "learning_rate": 0.001, "loss": 1.8145, "step": 840400 }, { "epoch": 49.16067146282974, "grad_norm": 0.16374970972537994, "learning_rate": 0.001, "loss": 1.8064, "step": 840500 }, { "epoch": 49.166520442182836, "grad_norm": 0.20154549181461334, "learning_rate": 0.001, "loss": 1.8041, "step": 840600 }, { "epoch": 49.17236942153594, "grad_norm": 0.20653954148292542, "learning_rate": 0.001, "loss": 1.8059, "step": 840700 }, { "epoch": 49.178218400889044, "grad_norm": 0.19045878946781158, "learning_rate": 0.001, "loss": 1.8082, "step": 840800 }, { "epoch": 49.18406738024215, "grad_norm": 0.19608359038829803, "learning_rate": 0.001, "loss": 1.81, "step": 840900 }, { "epoch": 49.18991635959525, "grad_norm": 0.1417297124862671, "learning_rate": 0.001, "loss": 1.8099, "step": 841000 }, { "epoch": 49.195765338948355, "grad_norm": 0.23512205481529236, "learning_rate": 0.001, "loss": 1.8039, "step": 841100 }, { "epoch": 49.20161431830146, "grad_norm": 0.1776459664106369, "learning_rate": 0.001, "loss": 1.8108, "step": 841200 }, { "epoch": 49.207463297654556, "grad_norm": 0.20987851917743683, "learning_rate": 0.001, "loss": 1.8003, "step": 841300 }, { "epoch": 49.21331227700766, "grad_norm": 0.2111242264509201, "learning_rate": 0.001, "loss": 1.8037, "step": 841400 }, { "epoch": 49.21916125636076, "grad_norm": 0.23690265417099, "learning_rate": 0.001, "loss": 1.8081, "step": 841500 }, { "epoch": 49.22501023571387, "grad_norm": 0.20073682069778442, "learning_rate": 0.001, "loss": 1.8155, "step": 841600 }, { "epoch": 49.23085921506697, "grad_norm": 0.21720920503139496, "learning_rate": 0.001, "loss": 1.8121, "step": 841700 }, { "epoch": 49.236708194420075, "grad_norm": 0.2749355733394623, "learning_rate": 0.001, "loss": 1.8069, "step": 841800 }, { "epoch": 49.24255717377318, "grad_norm": 0.2119852751493454, "learning_rate": 0.001, "loss": 1.8059, "step": 841900 }, { "epoch": 49.24840615312628, "grad_norm": 0.20359578728675842, "learning_rate": 0.001, "loss": 1.8067, "step": 842000 }, { "epoch": 49.25425513247938, "grad_norm": 0.16478034853935242, "learning_rate": 0.001, "loss": 1.8097, "step": 842100 }, { "epoch": 49.26010411183248, "grad_norm": 0.1637977957725525, "learning_rate": 0.001, "loss": 1.8061, "step": 842200 }, { "epoch": 49.26595309118559, "grad_norm": 0.2450864464044571, "learning_rate": 0.001, "loss": 1.8085, "step": 842300 }, { "epoch": 49.27180207053869, "grad_norm": 0.19898679852485657, "learning_rate": 0.001, "loss": 1.8044, "step": 842400 }, { "epoch": 49.277651049891794, "grad_norm": 0.15070347487926483, "learning_rate": 0.001, "loss": 1.811, "step": 842500 }, { "epoch": 49.2835000292449, "grad_norm": 0.2136269062757492, "learning_rate": 0.001, "loss": 1.8077, "step": 842600 }, { "epoch": 49.289349008598, "grad_norm": 0.20607079565525055, "learning_rate": 0.001, "loss": 1.8079, "step": 842700 }, { "epoch": 49.295197987951106, "grad_norm": 0.20708522200584412, "learning_rate": 0.001, "loss": 1.8133, "step": 842800 }, { "epoch": 49.3010469673042, "grad_norm": 0.23819977045059204, "learning_rate": 0.001, "loss": 1.8119, "step": 842900 }, { "epoch": 49.306895946657306, "grad_norm": 0.17005828022956848, "learning_rate": 0.001, "loss": 1.8084, "step": 843000 }, { "epoch": 49.31274492601041, "grad_norm": 0.31204718351364136, "learning_rate": 0.001, "loss": 1.8028, "step": 843100 }, { "epoch": 49.318593905363514, "grad_norm": 0.17722053825855255, "learning_rate": 0.001, "loss": 1.8122, "step": 843200 }, { "epoch": 49.32444288471662, "grad_norm": 0.18052895367145538, "learning_rate": 0.001, "loss": 1.8073, "step": 843300 }, { "epoch": 49.33029186406972, "grad_norm": 0.14713411033153534, "learning_rate": 0.001, "loss": 1.8106, "step": 843400 }, { "epoch": 49.336140843422825, "grad_norm": 0.20147204399108887, "learning_rate": 0.001, "loss": 1.8091, "step": 843500 }, { "epoch": 49.34198982277593, "grad_norm": 0.23016181588172913, "learning_rate": 0.001, "loss": 1.8134, "step": 843600 }, { "epoch": 49.347838802129026, "grad_norm": 0.22024506330490112, "learning_rate": 0.001, "loss": 1.8069, "step": 843700 }, { "epoch": 49.35368778148213, "grad_norm": 0.16473296284675598, "learning_rate": 0.001, "loss": 1.8123, "step": 843800 }, { "epoch": 49.35953676083523, "grad_norm": 0.2240588665008545, "learning_rate": 0.001, "loss": 1.8059, "step": 843900 }, { "epoch": 49.36538574018834, "grad_norm": 0.1657722443342209, "learning_rate": 0.001, "loss": 1.8062, "step": 844000 }, { "epoch": 49.37123471954144, "grad_norm": 0.17860302329063416, "learning_rate": 0.001, "loss": 1.8115, "step": 844100 }, { "epoch": 49.377083698894545, "grad_norm": 0.19175389409065247, "learning_rate": 0.001, "loss": 1.8045, "step": 844200 }, { "epoch": 49.38293267824765, "grad_norm": 0.1883769929409027, "learning_rate": 0.001, "loss": 1.8098, "step": 844300 }, { "epoch": 49.388781657600745, "grad_norm": 0.19618339836597443, "learning_rate": 0.001, "loss": 1.8079, "step": 844400 }, { "epoch": 49.39463063695385, "grad_norm": 0.23128636181354523, "learning_rate": 0.001, "loss": 1.8134, "step": 844500 }, { "epoch": 49.40047961630695, "grad_norm": 0.23832117021083832, "learning_rate": 0.001, "loss": 1.8115, "step": 844600 }, { "epoch": 49.40632859566006, "grad_norm": 0.16074606776237488, "learning_rate": 0.001, "loss": 1.8093, "step": 844700 }, { "epoch": 49.41217757501316, "grad_norm": 0.2264428287744522, "learning_rate": 0.001, "loss": 1.8122, "step": 844800 }, { "epoch": 49.418026554366264, "grad_norm": 0.270222932100296, "learning_rate": 0.001, "loss": 1.8085, "step": 844900 }, { "epoch": 49.42387553371937, "grad_norm": 0.16562116146087646, "learning_rate": 0.001, "loss": 1.8086, "step": 845000 }, { "epoch": 49.42972451307247, "grad_norm": 0.2564798593521118, "learning_rate": 0.001, "loss": 1.8144, "step": 845100 }, { "epoch": 49.43557349242557, "grad_norm": 0.18018575012683868, "learning_rate": 0.001, "loss": 1.8077, "step": 845200 }, { "epoch": 49.44142247177867, "grad_norm": 0.2320864200592041, "learning_rate": 0.001, "loss": 1.8168, "step": 845300 }, { "epoch": 49.447271451131776, "grad_norm": 0.17387329041957855, "learning_rate": 0.001, "loss": 1.8044, "step": 845400 }, { "epoch": 49.45312043048488, "grad_norm": 0.1590123325586319, "learning_rate": 0.001, "loss": 1.81, "step": 845500 }, { "epoch": 49.458969409837984, "grad_norm": 0.22552385926246643, "learning_rate": 0.001, "loss": 1.8163, "step": 845600 }, { "epoch": 49.46481838919109, "grad_norm": 0.19823677837848663, "learning_rate": 0.001, "loss": 1.8147, "step": 845700 }, { "epoch": 49.47066736854419, "grad_norm": 0.16059353947639465, "learning_rate": 0.001, "loss": 1.813, "step": 845800 }, { "epoch": 49.476516347897295, "grad_norm": 0.1817408800125122, "learning_rate": 0.001, "loss": 1.8084, "step": 845900 }, { "epoch": 49.48236532725039, "grad_norm": 0.19036823511123657, "learning_rate": 0.001, "loss": 1.8115, "step": 846000 }, { "epoch": 49.488214306603496, "grad_norm": 0.1901111751794815, "learning_rate": 0.001, "loss": 1.8155, "step": 846100 }, { "epoch": 49.4940632859566, "grad_norm": 0.19733230769634247, "learning_rate": 0.001, "loss": 1.8107, "step": 846200 }, { "epoch": 49.4999122653097, "grad_norm": 0.17071622610092163, "learning_rate": 0.001, "loss": 1.8096, "step": 846300 }, { "epoch": 49.50576124466281, "grad_norm": 0.19506050646305084, "learning_rate": 0.001, "loss": 1.8084, "step": 846400 }, { "epoch": 49.51161022401591, "grad_norm": 0.19300593435764313, "learning_rate": 0.001, "loss": 1.805, "step": 846500 }, { "epoch": 49.517459203369015, "grad_norm": 0.18821914494037628, "learning_rate": 0.001, "loss": 1.8084, "step": 846600 }, { "epoch": 49.52330818272212, "grad_norm": 0.2534908354282379, "learning_rate": 0.001, "loss": 1.8145, "step": 846700 }, { "epoch": 49.529157162075215, "grad_norm": 0.23953543603420258, "learning_rate": 0.001, "loss": 1.8134, "step": 846800 }, { "epoch": 49.53500614142832, "grad_norm": 0.23003968596458435, "learning_rate": 0.001, "loss": 1.8098, "step": 846900 }, { "epoch": 49.54085512078142, "grad_norm": 0.22165246307849884, "learning_rate": 0.001, "loss": 1.8169, "step": 847000 }, { "epoch": 49.54670410013453, "grad_norm": 0.17255665361881256, "learning_rate": 0.001, "loss": 1.8091, "step": 847100 }, { "epoch": 49.55255307948763, "grad_norm": 0.16412541270256042, "learning_rate": 0.001, "loss": 1.8163, "step": 847200 }, { "epoch": 49.558402058840734, "grad_norm": 0.20298638939857483, "learning_rate": 0.001, "loss": 1.8141, "step": 847300 }, { "epoch": 49.56425103819384, "grad_norm": 0.1766754537820816, "learning_rate": 0.001, "loss": 1.8062, "step": 847400 }, { "epoch": 49.570100017546935, "grad_norm": 0.17997391521930695, "learning_rate": 0.001, "loss": 1.8055, "step": 847500 }, { "epoch": 49.57594899690004, "grad_norm": 0.1749015897512436, "learning_rate": 0.001, "loss": 1.8111, "step": 847600 }, { "epoch": 49.58179797625314, "grad_norm": 0.23756088316440582, "learning_rate": 0.001, "loss": 1.8159, "step": 847700 }, { "epoch": 49.587646955606246, "grad_norm": 0.19849376380443573, "learning_rate": 0.001, "loss": 1.8164, "step": 847800 }, { "epoch": 49.59349593495935, "grad_norm": 0.16366566717624664, "learning_rate": 0.001, "loss": 1.812, "step": 847900 }, { "epoch": 49.599344914312454, "grad_norm": 0.2238994687795639, "learning_rate": 0.001, "loss": 1.8132, "step": 848000 }, { "epoch": 49.60519389366556, "grad_norm": 0.19019301235675812, "learning_rate": 0.001, "loss": 1.8122, "step": 848100 }, { "epoch": 49.61104287301866, "grad_norm": 0.3410313129425049, "learning_rate": 0.001, "loss": 1.8155, "step": 848200 }, { "epoch": 49.61689185237176, "grad_norm": 0.16841447353363037, "learning_rate": 0.001, "loss": 1.8148, "step": 848300 }, { "epoch": 49.62274083172486, "grad_norm": 0.43639978766441345, "learning_rate": 0.001, "loss": 1.8069, "step": 848400 }, { "epoch": 49.628589811077966, "grad_norm": 0.22146865725517273, "learning_rate": 0.001, "loss": 1.8223, "step": 848500 }, { "epoch": 49.63443879043107, "grad_norm": 0.20509423315525055, "learning_rate": 0.001, "loss": 1.8151, "step": 848600 }, { "epoch": 49.64028776978417, "grad_norm": 0.19317694008350372, "learning_rate": 0.001, "loss": 1.8104, "step": 848700 }, { "epoch": 49.64613674913728, "grad_norm": 0.230735644698143, "learning_rate": 0.001, "loss": 1.8144, "step": 848800 }, { "epoch": 49.65198572849038, "grad_norm": 0.2397209107875824, "learning_rate": 0.001, "loss": 1.814, "step": 848900 }, { "epoch": 49.657834707843485, "grad_norm": 0.24895387887954712, "learning_rate": 0.001, "loss": 1.8062, "step": 849000 }, { "epoch": 49.66368368719658, "grad_norm": 0.2732861042022705, "learning_rate": 0.001, "loss": 1.814, "step": 849100 }, { "epoch": 49.669532666549685, "grad_norm": 0.16902382671833038, "learning_rate": 0.001, "loss": 1.8128, "step": 849200 }, { "epoch": 49.67538164590279, "grad_norm": 0.16364645957946777, "learning_rate": 0.001, "loss": 1.8097, "step": 849300 }, { "epoch": 49.68123062525589, "grad_norm": 0.17659898102283478, "learning_rate": 0.001, "loss": 1.8161, "step": 849400 }, { "epoch": 49.687079604608996, "grad_norm": 0.1976972371339798, "learning_rate": 0.001, "loss": 1.8061, "step": 849500 }, { "epoch": 49.6929285839621, "grad_norm": 0.1783188134431839, "learning_rate": 0.001, "loss": 1.8158, "step": 849600 }, { "epoch": 49.698777563315204, "grad_norm": 0.1988726109266281, "learning_rate": 0.001, "loss": 1.8209, "step": 849700 }, { "epoch": 49.70462654266831, "grad_norm": 0.18942482769489288, "learning_rate": 0.001, "loss": 1.8154, "step": 849800 }, { "epoch": 49.710475522021405, "grad_norm": 0.21914996206760406, "learning_rate": 0.001, "loss": 1.8127, "step": 849900 }, { "epoch": 49.71632450137451, "grad_norm": 0.174288809299469, "learning_rate": 0.001, "loss": 1.8119, "step": 850000 }, { "epoch": 49.71632450137451, "eval_ag_news_accuracy": 0.243171875, "eval_ag_news_bleu_score": 7.334882882308836, "eval_ag_news_bleu_score_sem": 0.5167738245878762, "eval_ag_news_emb_cos_sim": 0.7433121204376221, "eval_ag_news_emb_cos_sim_sem": 0.012547106482088566, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7021632194519043, "eval_ag_news_n_ngrams_match_1": 14.359375, "eval_ag_news_n_ngrams_match_2": 4.1484375, "eval_ag_news_n_ngrams_match_3": 1.5859375, "eval_ag_news_num_pred_words": 45.0625, "eval_ag_news_num_true_words": 44.9921875, "eval_ag_news_perplexity": 14.911954690086109, "eval_ag_news_pred_num_tokens": 66.921875, "eval_ag_news_rouge_score": 0.3051765523129902, "eval_ag_news_runtime": 38.1163, "eval_ag_news_samples_per_second": 13.118, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3406253018934142, "eval_ag_news_token_set_f1_sem": 0.009116284019834088, "eval_ag_news_token_set_precision": 0.3245942128038345, "eval_ag_news_token_set_recall": 0.3627008319169642, "eval_ag_news_true_num_tokens": 62.4921875, "step": 850000 }, { "epoch": 49.71632450137451, "eval_anthropic_toxic_prompts_accuracy": 0.101765625, "eval_anthropic_toxic_prompts_bleu_score": 44.40772979131684, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.808255404711852, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8965466022491455, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008827364072203636, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03347745561491821, "eval_anthropic_toxic_prompts_loss": 1.2305115461349487, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.6015625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.2890625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.421875, "eval_anthropic_toxic_prompts_num_pred_words": 13.46875, "eval_anthropic_toxic_prompts_num_true_words": 13.8359375, "eval_anthropic_toxic_prompts_perplexity": 3.4229801007448017, "eval_anthropic_toxic_prompts_pred_num_tokens": 17.09375, "eval_anthropic_toxic_prompts_rouge_score": 0.7005959265068897, "eval_anthropic_toxic_prompts_runtime": 30.1044, "eval_anthropic_toxic_prompts_samples_per_second": 16.609, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.715887182607692, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01863476426302981, "eval_anthropic_toxic_prompts_token_set_precision": 0.7069111574265516, "eval_anthropic_toxic_prompts_token_set_recall": 0.7296750377317245, "eval_anthropic_toxic_prompts_true_num_tokens": 16.8515625, "step": 850000 }, { "epoch": 49.71632450137451, "eval_arxiv_accuracy": 0.374765625, "eval_arxiv_bleu_score": 1.9890984276783918, "eval_arxiv_bleu_score_sem": 0.18846693001814724, "eval_arxiv_emb_cos_sim": 0.4920576810836792, "eval_arxiv_emb_cos_sim_sem": 0.01877976581454277, "eval_arxiv_emb_top1_equal": 0.8984375, "eval_arxiv_emb_top1_equal_sem": 0.026804566383361816, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.427292823791504, "eval_arxiv_n_ngrams_match_1": 14.9609375, "eval_arxiv_n_ngrams_match_2": 2.71875, "eval_arxiv_n_ngrams_match_3": 0.609375, "eval_arxiv_num_pred_words": 57.2265625, "eval_arxiv_num_true_words": 87.125, "eval_arxiv_perplexity": 30.793167279411698, "eval_arxiv_pred_num_tokens": 125.4453125, "eval_arxiv_rouge_score": 0.19346423279303976, "eval_arxiv_runtime": 31.078, "eval_arxiv_samples_per_second": 16.089, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.1914468334131839, "eval_arxiv_token_set_f1_sem": 0.008181156123768902, "eval_arxiv_token_set_precision": 0.13246314568384926, "eval_arxiv_token_set_recall": 0.4105090231013816, "eval_arxiv_true_num_tokens": 125.2109375, "step": 850000 }, { "epoch": 49.71632450137451, "eval_python_code_alpaca_accuracy": 0.129, "eval_python_code_alpaca_bleu_score": 30.453869529772653, "eval_python_code_alpaca_bleu_score_sem": 1.6057713690219741, "eval_python_code_alpaca_emb_cos_sim": 0.8873302340507507, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008223472163081169, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4802024364471436, "eval_python_code_alpaca_n_ngrams_match_1": 10.734375, "eval_python_code_alpaca_n_ngrams_match_2": 6.0078125, "eval_python_code_alpaca_n_ngrams_match_3": 3.4921875, "eval_python_code_alpaca_num_pred_words": 17.34375, "eval_python_code_alpaca_num_true_words": 18.421875, "eval_python_code_alpaca_perplexity": 4.393835063253559, "eval_python_code_alpaca_pred_num_tokens": 22.71875, "eval_python_code_alpaca_rouge_score": 0.6235496733658445, "eval_python_code_alpaca_runtime": 30.2515, "eval_python_code_alpaca_samples_per_second": 16.528, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6384379803435855, "eval_python_code_alpaca_token_set_f1_sem": 0.013733513180081178, "eval_python_code_alpaca_token_set_precision": 0.6238486403103327, "eval_python_code_alpaca_token_set_recall": 0.6592095867192101, "eval_python_code_alpaca_true_num_tokens": 23.4921875, "step": 850000 }, { "epoch": 49.71632450137451, "eval_wikibio_accuracy": 0.372296875, "eval_wikibio_bleu_score": 7.456548909100208, "eval_wikibio_bleu_score_sem": 0.6434138717407817, "eval_wikibio_emb_cos_sim": 0.6236448287963867, "eval_wikibio_emb_cos_sim_sem": 0.022115731611847878, "eval_wikibio_emb_top1_equal": 0.9375, "eval_wikibio_emb_top1_equal_sem": 0.02147948183119297, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6720194816589355, "eval_wikibio_n_ngrams_match_1": 15.921875, "eval_wikibio_n_ngrams_match_2": 5.734375, "eval_wikibio_n_ngrams_match_3": 2.3359375, "eval_wikibio_num_pred_words": 53.3984375, "eval_wikibio_num_true_words": 54.515625, "eval_wikibio_perplexity": 14.469159910882505, "eval_wikibio_pred_num_tokens": 108.3125, "eval_wikibio_rouge_score": 0.29530422676314205, "eval_wikibio_runtime": 31.2718, "eval_wikibio_samples_per_second": 15.989, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.328091452844346, "eval_wikibio_token_set_f1_sem": 0.01177572579698731, "eval_wikibio_token_set_precision": 0.2845292058801301, "eval_wikibio_token_set_recall": 0.42640518226897334, "eval_wikibio_true_num_tokens": 101.984375, "step": 850000 }, { "epoch": 49.71632450137451, "eval_msmarco_accuracy": 0.3915, "eval_msmarco_bleu_score": 15.999828322633281, "eval_msmarco_bleu_score_sem": 1.1751165143543922, "eval_msmarco_emb_cos_sim": 0.7989825010299683, "eval_msmarco_emb_cos_sim_sem": 0.015376691706478596, "eval_msmarco_emb_top1_equal": 0.9453125, "eval_msmarco_emb_top1_equal_sem": 0.020175758749246597, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.6907631158828735, "eval_msmarco_n_ngrams_match_1": 27.40625, "eval_msmarco_n_ngrams_match_2": 12.1796875, "eval_msmarco_n_ngrams_match_3": 6.5546875, "eval_msmarco_num_pred_words": 60.6171875, "eval_msmarco_num_true_words": 61.015625, "eval_msmarco_perplexity": 5.423617975341746, "eval_msmarco_pred_num_tokens": 83.4375, "eval_msmarco_rouge_score": 0.4322045831995608, "eval_msmarco_runtime": 25.0164, "eval_msmarco_samples_per_second": 19.987, "eval_msmarco_steps_per_second": 0.04, "eval_msmarco_token_set_f1": 0.459286020771838, "eval_msmarco_token_set_f1_sem": 0.013239512230876813, "eval_msmarco_token_set_precision": 0.4237180101061835, "eval_msmarco_token_set_recall": 0.518146818152641, "eval_msmarco_true_num_tokens": 78.8671875, "step": 850000 }, { "epoch": 49.72217348072761, "grad_norm": 0.19450530409812927, "learning_rate": 0.001, "loss": 1.8117, "step": 850100 }, { "epoch": 49.728022460080716, "grad_norm": 0.25059089064598083, "learning_rate": 0.001, "loss": 1.8097, "step": 850200 }, { "epoch": 49.73387143943382, "grad_norm": 0.19073432683944702, "learning_rate": 0.001, "loss": 1.8159, "step": 850300 }, { "epoch": 49.73972041878692, "grad_norm": 0.23206202685832977, "learning_rate": 0.001, "loss": 1.8111, "step": 850400 }, { "epoch": 49.74556939814003, "grad_norm": 0.2016201764345169, "learning_rate": 0.001, "loss": 1.8202, "step": 850500 }, { "epoch": 49.751418377493124, "grad_norm": 0.2867734730243683, "learning_rate": 0.001, "loss": 1.8157, "step": 850600 }, { "epoch": 49.75726735684623, "grad_norm": 0.22242912650108337, "learning_rate": 0.001, "loss": 1.8097, "step": 850700 }, { "epoch": 49.76311633619933, "grad_norm": 0.18775035440921783, "learning_rate": 0.001, "loss": 1.8188, "step": 850800 }, { "epoch": 49.768965315552435, "grad_norm": 0.24105790257453918, "learning_rate": 0.001, "loss": 1.816, "step": 850900 }, { "epoch": 49.77481429490554, "grad_norm": 0.26778051257133484, "learning_rate": 0.001, "loss": 1.8204, "step": 851000 }, { "epoch": 49.78066327425864, "grad_norm": 0.167306587100029, "learning_rate": 0.001, "loss": 1.8133, "step": 851100 }, { "epoch": 49.78651225361175, "grad_norm": 0.18950361013412476, "learning_rate": 0.001, "loss": 1.8165, "step": 851200 }, { "epoch": 49.79236123296485, "grad_norm": 0.17041799426078796, "learning_rate": 0.001, "loss": 1.8132, "step": 851300 }, { "epoch": 49.79821021231795, "grad_norm": 0.2056003361940384, "learning_rate": 0.001, "loss": 1.8208, "step": 851400 }, { "epoch": 49.80405919167105, "grad_norm": 0.20684027671813965, "learning_rate": 0.001, "loss": 1.8102, "step": 851500 }, { "epoch": 49.809908171024155, "grad_norm": 0.19417347013950348, "learning_rate": 0.001, "loss": 1.8125, "step": 851600 }, { "epoch": 49.81575715037726, "grad_norm": 0.2806686758995056, "learning_rate": 0.001, "loss": 1.8162, "step": 851700 }, { "epoch": 49.82160612973036, "grad_norm": 0.2343699336051941, "learning_rate": 0.001, "loss": 1.811, "step": 851800 }, { "epoch": 49.827455109083466, "grad_norm": 0.21059128642082214, "learning_rate": 0.001, "loss": 1.8139, "step": 851900 }, { "epoch": 49.83330408843657, "grad_norm": 0.14868304133415222, "learning_rate": 0.001, "loss": 1.8155, "step": 852000 }, { "epoch": 49.839153067789674, "grad_norm": 0.18515628576278687, "learning_rate": 0.001, "loss": 1.8172, "step": 852100 }, { "epoch": 49.84500204714277, "grad_norm": 0.16236519813537598, "learning_rate": 0.001, "loss": 1.8191, "step": 852200 }, { "epoch": 49.850851026495874, "grad_norm": 0.23943759500980377, "learning_rate": 0.001, "loss": 1.8102, "step": 852300 }, { "epoch": 49.85670000584898, "grad_norm": 0.1741102933883667, "learning_rate": 0.001, "loss": 1.8149, "step": 852400 }, { "epoch": 49.86254898520208, "grad_norm": 0.18765369057655334, "learning_rate": 0.001, "loss": 1.819, "step": 852500 }, { "epoch": 49.868397964555186, "grad_norm": 0.20202822983264923, "learning_rate": 0.001, "loss": 1.8215, "step": 852600 }, { "epoch": 49.87424694390829, "grad_norm": 0.21745994687080383, "learning_rate": 0.001, "loss": 1.8126, "step": 852700 }, { "epoch": 49.88009592326139, "grad_norm": 0.1992667019367218, "learning_rate": 0.001, "loss": 1.8198, "step": 852800 }, { "epoch": 49.8859449026145, "grad_norm": 0.17933714389801025, "learning_rate": 0.001, "loss": 1.814, "step": 852900 }, { "epoch": 49.891793881967594, "grad_norm": 0.2461792379617691, "learning_rate": 0.001, "loss": 1.8191, "step": 853000 }, { "epoch": 49.8976428613207, "grad_norm": 0.17844122648239136, "learning_rate": 0.001, "loss": 1.8088, "step": 853100 }, { "epoch": 49.9034918406738, "grad_norm": 0.22299131751060486, "learning_rate": 0.001, "loss": 1.8166, "step": 853200 }, { "epoch": 49.909340820026905, "grad_norm": 0.16200996935367584, "learning_rate": 0.001, "loss": 1.8096, "step": 853300 }, { "epoch": 49.91518979938001, "grad_norm": 0.1596326231956482, "learning_rate": 0.001, "loss": 1.8129, "step": 853400 }, { "epoch": 49.92103877873311, "grad_norm": 0.16422124207019806, "learning_rate": 0.001, "loss": 1.8189, "step": 853500 }, { "epoch": 49.92688775808622, "grad_norm": 0.20924773812294006, "learning_rate": 0.001, "loss": 1.8156, "step": 853600 }, { "epoch": 49.93273673743931, "grad_norm": 0.19163650274276733, "learning_rate": 0.001, "loss": 1.8187, "step": 853700 }, { "epoch": 49.93858571679242, "grad_norm": 0.18530459702014923, "learning_rate": 0.001, "loss": 1.8127, "step": 853800 }, { "epoch": 49.94443469614552, "grad_norm": 0.17362749576568604, "learning_rate": 0.001, "loss": 1.811, "step": 853900 }, { "epoch": 49.950283675498625, "grad_norm": 0.19935616850852966, "learning_rate": 0.001, "loss": 1.809, "step": 854000 }, { "epoch": 49.95613265485173, "grad_norm": 0.16997236013412476, "learning_rate": 0.001, "loss": 1.812, "step": 854100 }, { "epoch": 49.96198163420483, "grad_norm": 0.17931705713272095, "learning_rate": 0.001, "loss": 1.8135, "step": 854200 }, { "epoch": 49.967830613557936, "grad_norm": 0.21800263226032257, "learning_rate": 0.001, "loss": 1.8181, "step": 854300 }, { "epoch": 49.97367959291104, "grad_norm": 0.17488974332809448, "learning_rate": 0.001, "loss": 1.8151, "step": 854400 }, { "epoch": 49.97952857226414, "grad_norm": 0.24133193492889404, "learning_rate": 0.001, "loss": 1.8151, "step": 854500 }, { "epoch": 49.98537755161724, "grad_norm": 0.21661359071731567, "learning_rate": 0.001, "loss": 1.8102, "step": 854600 }, { "epoch": 49.991226530970344, "grad_norm": 0.17593632638454437, "learning_rate": 0.001, "loss": 1.8137, "step": 854700 }, { "epoch": 49.99707551032345, "grad_norm": 0.21883881092071533, "learning_rate": 0.001, "loss": 1.8197, "step": 854800 }, { "epoch": 50.00292448967655, "grad_norm": 0.23469886183738708, "learning_rate": 0.001, "loss": 1.8009, "step": 854900 }, { "epoch": 50.008773469029656, "grad_norm": 0.2272767722606659, "learning_rate": 0.001, "loss": 1.7968, "step": 855000 }, { "epoch": 50.01462244838276, "grad_norm": 0.22798390686511993, "learning_rate": 0.001, "loss": 1.7963, "step": 855100 }, { "epoch": 50.02047142773586, "grad_norm": 0.21079273521900177, "learning_rate": 0.001, "loss": 1.8049, "step": 855200 }, { "epoch": 50.02632040708896, "grad_norm": 0.2696229815483093, "learning_rate": 0.001, "loss": 1.7966, "step": 855300 }, { "epoch": 50.032169386442064, "grad_norm": 0.1925000697374344, "learning_rate": 0.001, "loss": 1.7904, "step": 855400 }, { "epoch": 50.03801836579517, "grad_norm": 0.25098440051078796, "learning_rate": 0.001, "loss": 1.7985, "step": 855500 }, { "epoch": 50.04386734514827, "grad_norm": 0.23407728970050812, "learning_rate": 0.001, "loss": 1.7962, "step": 855600 }, { "epoch": 50.049716324501375, "grad_norm": 0.22840525209903717, "learning_rate": 0.001, "loss": 1.8083, "step": 855700 }, { "epoch": 50.05556530385448, "grad_norm": 0.21270568668842316, "learning_rate": 0.001, "loss": 1.8008, "step": 855800 }, { "epoch": 50.06141428320758, "grad_norm": 0.2519147992134094, "learning_rate": 0.001, "loss": 1.8047, "step": 855900 }, { "epoch": 50.06726326256069, "grad_norm": 0.24309810996055603, "learning_rate": 0.001, "loss": 1.804, "step": 856000 }, { "epoch": 50.07311224191378, "grad_norm": 0.221232607960701, "learning_rate": 0.001, "loss": 1.8023, "step": 856100 }, { "epoch": 50.07896122126689, "grad_norm": 0.26714056730270386, "learning_rate": 0.001, "loss": 1.8019, "step": 856200 }, { "epoch": 50.08481020061999, "grad_norm": 0.26417478919029236, "learning_rate": 0.001, "loss": 1.8023, "step": 856300 }, { "epoch": 50.090659179973095, "grad_norm": 0.22777849435806274, "learning_rate": 0.001, "loss": 1.8025, "step": 856400 }, { "epoch": 50.0965081593262, "grad_norm": 0.22337794303894043, "learning_rate": 0.001, "loss": 1.8026, "step": 856500 }, { "epoch": 50.1023571386793, "grad_norm": 0.1990976184606552, "learning_rate": 0.001, "loss": 1.8005, "step": 856600 }, { "epoch": 50.108206118032406, "grad_norm": 0.1941041350364685, "learning_rate": 0.001, "loss": 1.8058, "step": 856700 }, { "epoch": 50.1140550973855, "grad_norm": 0.2340371012687683, "learning_rate": 0.001, "loss": 1.8125, "step": 856800 }, { "epoch": 50.11990407673861, "grad_norm": 0.2373354583978653, "learning_rate": 0.001, "loss": 1.8106, "step": 856900 }, { "epoch": 50.12575305609171, "grad_norm": 0.2503221035003662, "learning_rate": 0.001, "loss": 1.8045, "step": 857000 }, { "epoch": 50.131602035444814, "grad_norm": 0.2507171630859375, "learning_rate": 0.001, "loss": 1.797, "step": 857100 }, { "epoch": 50.13745101479792, "grad_norm": 0.2689734995365143, "learning_rate": 0.001, "loss": 1.8055, "step": 857200 }, { "epoch": 50.14329999415102, "grad_norm": 0.2676350772380829, "learning_rate": 0.001, "loss": 1.8002, "step": 857300 }, { "epoch": 50.149148973504126, "grad_norm": 0.22491636872291565, "learning_rate": 0.001, "loss": 1.8057, "step": 857400 }, { "epoch": 50.15499795285723, "grad_norm": 0.20455780625343323, "learning_rate": 0.001, "loss": 1.8022, "step": 857500 }, { "epoch": 50.160846932210326, "grad_norm": 0.21162033081054688, "learning_rate": 0.001, "loss": 1.805, "step": 857600 }, { "epoch": 50.16669591156343, "grad_norm": 0.21946336328983307, "learning_rate": 0.001, "loss": 1.8068, "step": 857700 }, { "epoch": 50.172544890916534, "grad_norm": 0.19948041439056396, "learning_rate": 0.001, "loss": 1.8033, "step": 857800 }, { "epoch": 50.17839387026964, "grad_norm": 0.3168799579143524, "learning_rate": 0.001, "loss": 1.8025, "step": 857900 }, { "epoch": 50.18424284962274, "grad_norm": 0.2615254819393158, "learning_rate": 0.001, "loss": 1.8044, "step": 858000 }, { "epoch": 50.190091828975845, "grad_norm": 0.23780569434165955, "learning_rate": 0.001, "loss": 1.8065, "step": 858100 }, { "epoch": 50.19594080832895, "grad_norm": 0.27939072251319885, "learning_rate": 0.001, "loss": 1.8056, "step": 858200 }, { "epoch": 50.20178978768205, "grad_norm": 0.25240644812583923, "learning_rate": 0.001, "loss": 1.8127, "step": 858300 }, { "epoch": 50.20763876703515, "grad_norm": 0.30814680457115173, "learning_rate": 0.001, "loss": 1.8018, "step": 858400 }, { "epoch": 50.21348774638825, "grad_norm": 0.2684856355190277, "learning_rate": 0.001, "loss": 1.8078, "step": 858500 }, { "epoch": 50.21933672574136, "grad_norm": 0.20924712717533112, "learning_rate": 0.001, "loss": 1.8048, "step": 858600 }, { "epoch": 50.22518570509446, "grad_norm": 0.2209189236164093, "learning_rate": 0.001, "loss": 1.8045, "step": 858700 }, { "epoch": 50.231034684447565, "grad_norm": 0.2046547681093216, "learning_rate": 0.001, "loss": 1.8053, "step": 858800 }, { "epoch": 50.23688366380067, "grad_norm": 0.25163450837135315, "learning_rate": 0.001, "loss": 1.8017, "step": 858900 }, { "epoch": 50.24273264315377, "grad_norm": 0.2585326135158539, "learning_rate": 0.001, "loss": 1.8027, "step": 859000 }, { "epoch": 50.248581622506876, "grad_norm": 0.22656095027923584, "learning_rate": 0.001, "loss": 1.8044, "step": 859100 }, { "epoch": 50.25443060185997, "grad_norm": 0.24712517857551575, "learning_rate": 0.001, "loss": 1.8066, "step": 859200 }, { "epoch": 50.26027958121308, "grad_norm": 0.23963028192520142, "learning_rate": 0.001, "loss": 1.8088, "step": 859300 }, { "epoch": 50.26612856056618, "grad_norm": 0.2874285876750946, "learning_rate": 0.001, "loss": 1.8129, "step": 859400 }, { "epoch": 50.271977539919284, "grad_norm": 0.2113020420074463, "learning_rate": 0.001, "loss": 1.811, "step": 859500 }, { "epoch": 50.27782651927239, "grad_norm": 0.20709995925426483, "learning_rate": 0.001, "loss": 1.8069, "step": 859600 }, { "epoch": 50.28367549862549, "grad_norm": 0.23524270951747894, "learning_rate": 0.001, "loss": 1.8099, "step": 859700 }, { "epoch": 50.289524477978595, "grad_norm": 0.19203004240989685, "learning_rate": 0.001, "loss": 1.8085, "step": 859800 }, { "epoch": 50.29537345733169, "grad_norm": 0.24430225789546967, "learning_rate": 0.001, "loss": 1.8082, "step": 859900 }, { "epoch": 50.301222436684796, "grad_norm": 0.20951421558856964, "learning_rate": 0.001, "loss": 1.8027, "step": 860000 }, { "epoch": 50.3070714160379, "grad_norm": 0.20376397669315338, "learning_rate": 0.001, "loss": 1.8083, "step": 860100 }, { "epoch": 50.312920395391004, "grad_norm": 0.26319220662117004, "learning_rate": 0.001, "loss": 1.8094, "step": 860200 }, { "epoch": 50.31876937474411, "grad_norm": 0.26384949684143066, "learning_rate": 0.001, "loss": 1.8121, "step": 860300 }, { "epoch": 50.32461835409721, "grad_norm": 0.29800674319267273, "learning_rate": 0.001, "loss": 1.8078, "step": 860400 }, { "epoch": 50.330467333450315, "grad_norm": 0.23136670887470245, "learning_rate": 0.001, "loss": 1.8136, "step": 860500 }, { "epoch": 50.33631631280342, "grad_norm": 0.281358003616333, "learning_rate": 0.001, "loss": 1.8047, "step": 860600 }, { "epoch": 50.342165292156515, "grad_norm": 0.21767036616802216, "learning_rate": 0.001, "loss": 1.8056, "step": 860700 }, { "epoch": 50.34801427150962, "grad_norm": 0.2328200340270996, "learning_rate": 0.001, "loss": 1.8078, "step": 860800 }, { "epoch": 50.35386325086272, "grad_norm": 0.21859100461006165, "learning_rate": 0.001, "loss": 1.8093, "step": 860900 }, { "epoch": 50.35971223021583, "grad_norm": 0.20335625112056732, "learning_rate": 0.001, "loss": 1.8116, "step": 861000 }, { "epoch": 50.36556120956893, "grad_norm": 0.25588440895080566, "learning_rate": 0.001, "loss": 1.8086, "step": 861100 }, { "epoch": 50.371410188922034, "grad_norm": 0.30600476264953613, "learning_rate": 0.001, "loss": 1.8063, "step": 861200 }, { "epoch": 50.37725916827514, "grad_norm": 0.25017035007476807, "learning_rate": 0.001, "loss": 1.8089, "step": 861300 }, { "epoch": 50.38310814762824, "grad_norm": 0.2659381330013275, "learning_rate": 0.001, "loss": 1.8128, "step": 861400 }, { "epoch": 50.38895712698134, "grad_norm": 0.30818551778793335, "learning_rate": 0.001, "loss": 1.8095, "step": 861500 }, { "epoch": 50.39480610633444, "grad_norm": 0.30659735202789307, "learning_rate": 0.001, "loss": 1.8126, "step": 861600 }, { "epoch": 50.400655085687546, "grad_norm": 0.22533798217773438, "learning_rate": 0.001, "loss": 1.808, "step": 861700 }, { "epoch": 50.40650406504065, "grad_norm": 0.19762259721755981, "learning_rate": 0.001, "loss": 1.8055, "step": 861800 }, { "epoch": 50.412353044393754, "grad_norm": 0.24758468568325043, "learning_rate": 0.001, "loss": 1.8043, "step": 861900 }, { "epoch": 50.41820202374686, "grad_norm": 0.314604789018631, "learning_rate": 0.001, "loss": 1.8104, "step": 862000 }, { "epoch": 50.42405100309996, "grad_norm": 0.20843498408794403, "learning_rate": 0.001, "loss": 1.8105, "step": 862100 }, { "epoch": 50.429899982453065, "grad_norm": 0.22199158370494843, "learning_rate": 0.001, "loss": 1.8044, "step": 862200 }, { "epoch": 50.43574896180616, "grad_norm": 0.2034108191728592, "learning_rate": 0.001, "loss": 1.8065, "step": 862300 }, { "epoch": 50.441597941159266, "grad_norm": 0.2337125837802887, "learning_rate": 0.001, "loss": 1.8011, "step": 862400 }, { "epoch": 50.44744692051237, "grad_norm": 0.24759352207183838, "learning_rate": 0.001, "loss": 1.8042, "step": 862500 }, { "epoch": 50.45329589986547, "grad_norm": 0.26785194873809814, "learning_rate": 0.001, "loss": 1.8119, "step": 862600 }, { "epoch": 50.45914487921858, "grad_norm": 0.24967366456985474, "learning_rate": 0.001, "loss": 1.8052, "step": 862700 }, { "epoch": 50.46499385857168, "grad_norm": 0.2392304241657257, "learning_rate": 0.001, "loss": 1.8101, "step": 862800 }, { "epoch": 50.470842837924785, "grad_norm": 0.19949263334274292, "learning_rate": 0.001, "loss": 1.8135, "step": 862900 }, { "epoch": 50.47669181727788, "grad_norm": 0.23953354358673096, "learning_rate": 0.001, "loss": 1.8079, "step": 863000 }, { "epoch": 50.482540796630985, "grad_norm": 0.2772960364818573, "learning_rate": 0.001, "loss": 1.8086, "step": 863100 }, { "epoch": 50.48838977598409, "grad_norm": 0.23988045752048492, "learning_rate": 0.001, "loss": 1.812, "step": 863200 }, { "epoch": 50.49423875533719, "grad_norm": 0.23829962313175201, "learning_rate": 0.001, "loss": 1.8051, "step": 863300 }, { "epoch": 50.5000877346903, "grad_norm": 0.297387957572937, "learning_rate": 0.001, "loss": 1.8029, "step": 863400 }, { "epoch": 50.5059367140434, "grad_norm": 0.2883743345737457, "learning_rate": 0.001, "loss": 1.8101, "step": 863500 }, { "epoch": 50.511785693396504, "grad_norm": 0.2644404172897339, "learning_rate": 0.001, "loss": 1.8146, "step": 863600 }, { "epoch": 50.51763467274961, "grad_norm": 0.20468077063560486, "learning_rate": 0.001, "loss": 1.8092, "step": 863700 }, { "epoch": 50.523483652102705, "grad_norm": 0.22279192507266998, "learning_rate": 0.001, "loss": 1.8103, "step": 863800 }, { "epoch": 50.52933263145581, "grad_norm": 0.24688786268234253, "learning_rate": 0.001, "loss": 1.8102, "step": 863900 }, { "epoch": 50.53518161080891, "grad_norm": 0.20783323049545288, "learning_rate": 0.001, "loss": 1.8117, "step": 864000 }, { "epoch": 50.541030590162016, "grad_norm": 0.2582106590270996, "learning_rate": 0.001, "loss": 1.8121, "step": 864100 }, { "epoch": 50.54687956951512, "grad_norm": 0.2222825139760971, "learning_rate": 0.001, "loss": 1.8109, "step": 864200 }, { "epoch": 50.552728548868224, "grad_norm": 0.2606070935726166, "learning_rate": 0.001, "loss": 1.8068, "step": 864300 }, { "epoch": 50.55857752822133, "grad_norm": 0.21967680752277374, "learning_rate": 0.001, "loss": 1.8053, "step": 864400 }, { "epoch": 50.56442650757443, "grad_norm": 0.23322513699531555, "learning_rate": 0.001, "loss": 1.8134, "step": 864500 }, { "epoch": 50.57027548692753, "grad_norm": 0.22236602008342743, "learning_rate": 0.001, "loss": 1.8085, "step": 864600 }, { "epoch": 50.57612446628063, "grad_norm": 0.2107338011264801, "learning_rate": 0.001, "loss": 1.8065, "step": 864700 }, { "epoch": 50.581973445633736, "grad_norm": 0.24633049964904785, "learning_rate": 0.001, "loss": 1.8092, "step": 864800 }, { "epoch": 50.58782242498684, "grad_norm": 0.2520010471343994, "learning_rate": 0.001, "loss": 1.8049, "step": 864900 }, { "epoch": 50.59367140433994, "grad_norm": 0.21707899868488312, "learning_rate": 0.001, "loss": 1.8065, "step": 865000 }, { "epoch": 50.59952038369305, "grad_norm": 0.412011057138443, "learning_rate": 0.001, "loss": 1.8098, "step": 865100 }, { "epoch": 50.60536936304615, "grad_norm": 0.27167728543281555, "learning_rate": 0.001, "loss": 1.8153, "step": 865200 }, { "epoch": 50.611218342399255, "grad_norm": 0.1886942833662033, "learning_rate": 0.001, "loss": 1.81, "step": 865300 }, { "epoch": 50.61706732175235, "grad_norm": 0.24563159048557281, "learning_rate": 0.001, "loss": 1.8147, "step": 865400 }, { "epoch": 50.622916301105455, "grad_norm": 0.24839337170124054, "learning_rate": 0.001, "loss": 1.8151, "step": 865500 }, { "epoch": 50.62876528045856, "grad_norm": 0.24975714087486267, "learning_rate": 0.001, "loss": 1.8113, "step": 865600 }, { "epoch": 50.63461425981166, "grad_norm": 0.2463103085756302, "learning_rate": 0.001, "loss": 1.8094, "step": 865700 }, { "epoch": 50.64046323916477, "grad_norm": 0.21446460485458374, "learning_rate": 0.001, "loss": 1.8073, "step": 865800 }, { "epoch": 50.64631221851787, "grad_norm": 0.22495536506175995, "learning_rate": 0.001, "loss": 1.8146, "step": 865900 }, { "epoch": 50.652161197870974, "grad_norm": 0.20351524651050568, "learning_rate": 0.001, "loss": 1.8079, "step": 866000 }, { "epoch": 50.65801017722407, "grad_norm": 0.21401691436767578, "learning_rate": 0.001, "loss": 1.8127, "step": 866100 }, { "epoch": 50.663859156577175, "grad_norm": 0.2448999434709549, "learning_rate": 0.001, "loss": 1.8043, "step": 866200 }, { "epoch": 50.66970813593028, "grad_norm": 0.2650894522666931, "learning_rate": 0.001, "loss": 1.8083, "step": 866300 }, { "epoch": 50.67555711528338, "grad_norm": 0.22860708832740784, "learning_rate": 0.001, "loss": 1.8083, "step": 866400 }, { "epoch": 50.681406094636486, "grad_norm": 0.268598347902298, "learning_rate": 0.001, "loss": 1.812, "step": 866500 }, { "epoch": 50.68725507398959, "grad_norm": 0.2406046986579895, "learning_rate": 0.001, "loss": 1.8167, "step": 866600 }, { "epoch": 50.693104053342694, "grad_norm": 0.27081242203712463, "learning_rate": 0.001, "loss": 1.8081, "step": 866700 }, { "epoch": 50.6989530326958, "grad_norm": 0.26558446884155273, "learning_rate": 0.001, "loss": 1.8074, "step": 866800 }, { "epoch": 50.704802012048894, "grad_norm": 0.24764442443847656, "learning_rate": 0.001, "loss": 1.8076, "step": 866900 }, { "epoch": 50.710650991402, "grad_norm": 0.24268490076065063, "learning_rate": 0.001, "loss": 1.8088, "step": 867000 }, { "epoch": 50.7164999707551, "grad_norm": 0.2997998595237732, "learning_rate": 0.001, "loss": 1.8116, "step": 867100 }, { "epoch": 50.722348950108206, "grad_norm": 0.2914184331893921, "learning_rate": 0.001, "loss": 1.8116, "step": 867200 }, { "epoch": 50.72819792946131, "grad_norm": 0.20055516064167023, "learning_rate": 0.001, "loss": 1.813, "step": 867300 }, { "epoch": 50.73404690881441, "grad_norm": 0.25379350781440735, "learning_rate": 0.001, "loss": 1.8098, "step": 867400 }, { "epoch": 50.73989588816752, "grad_norm": 0.27099576592445374, "learning_rate": 0.001, "loss": 1.8092, "step": 867500 }, { "epoch": 50.74574486752062, "grad_norm": 0.2062646746635437, "learning_rate": 0.001, "loss": 1.8097, "step": 867600 }, { "epoch": 50.75159384687372, "grad_norm": 0.238939568400383, "learning_rate": 0.001, "loss": 1.8095, "step": 867700 }, { "epoch": 50.75744282622682, "grad_norm": 0.23938138782978058, "learning_rate": 0.001, "loss": 1.8124, "step": 867800 }, { "epoch": 50.763291805579925, "grad_norm": 0.24307915568351746, "learning_rate": 0.001, "loss": 1.8138, "step": 867900 }, { "epoch": 50.76914078493303, "grad_norm": 0.2643444240093231, "learning_rate": 0.001, "loss": 1.8097, "step": 868000 }, { "epoch": 50.77498976428613, "grad_norm": 0.2440427988767624, "learning_rate": 0.001, "loss": 1.8098, "step": 868100 }, { "epoch": 50.78083874363924, "grad_norm": 0.22349992394447327, "learning_rate": 0.001, "loss": 1.8121, "step": 868200 }, { "epoch": 50.78668772299234, "grad_norm": 0.206733837723732, "learning_rate": 0.001, "loss": 1.8086, "step": 868300 }, { "epoch": 50.792536702345444, "grad_norm": 0.26691940426826477, "learning_rate": 0.001, "loss": 1.8077, "step": 868400 }, { "epoch": 50.79838568169854, "grad_norm": 0.2643040716648102, "learning_rate": 0.001, "loss": 1.8152, "step": 868500 }, { "epoch": 50.804234661051645, "grad_norm": 0.23502038419246674, "learning_rate": 0.001, "loss": 1.81, "step": 868600 }, { "epoch": 50.81008364040475, "grad_norm": 0.2975026071071625, "learning_rate": 0.001, "loss": 1.812, "step": 868700 }, { "epoch": 50.81593261975785, "grad_norm": 0.22237743437290192, "learning_rate": 0.001, "loss": 1.8099, "step": 868800 }, { "epoch": 50.821781599110956, "grad_norm": 0.22151629626750946, "learning_rate": 0.001, "loss": 1.8105, "step": 868900 }, { "epoch": 50.82763057846406, "grad_norm": 0.2301187664270401, "learning_rate": 0.001, "loss": 1.8125, "step": 869000 }, { "epoch": 50.833479557817164, "grad_norm": 0.216695636510849, "learning_rate": 0.001, "loss": 1.812, "step": 869100 }, { "epoch": 50.83932853717026, "grad_norm": 0.29520365595817566, "learning_rate": 0.001, "loss": 1.8207, "step": 869200 }, { "epoch": 50.845177516523364, "grad_norm": 0.22615595161914825, "learning_rate": 0.001, "loss": 1.8097, "step": 869300 }, { "epoch": 50.85102649587647, "grad_norm": 0.28372547030448914, "learning_rate": 0.001, "loss": 1.8159, "step": 869400 }, { "epoch": 50.85687547522957, "grad_norm": 0.3124528229236603, "learning_rate": 0.001, "loss": 1.8123, "step": 869500 }, { "epoch": 50.862724454582676, "grad_norm": 0.23940417170524597, "learning_rate": 0.001, "loss": 1.8116, "step": 869600 }, { "epoch": 50.86857343393578, "grad_norm": 0.3349553644657135, "learning_rate": 0.001, "loss": 1.8128, "step": 869700 }, { "epoch": 50.87442241328888, "grad_norm": 0.21582290530204773, "learning_rate": 0.001, "loss": 1.8135, "step": 869800 }, { "epoch": 50.88027139264199, "grad_norm": 0.23268067836761475, "learning_rate": 0.001, "loss": 1.8122, "step": 869900 }, { "epoch": 50.886120371995084, "grad_norm": 0.3528066873550415, "learning_rate": 0.001, "loss": 1.8157, "step": 870000 }, { "epoch": 50.89196935134819, "grad_norm": 0.23177000880241394, "learning_rate": 0.001, "loss": 1.8145, "step": 870100 }, { "epoch": 50.89781833070129, "grad_norm": 0.22068308293819427, "learning_rate": 0.001, "loss": 1.8065, "step": 870200 }, { "epoch": 50.903667310054395, "grad_norm": 0.22941842675209045, "learning_rate": 0.001, "loss": 1.8204, "step": 870300 }, { "epoch": 50.9095162894075, "grad_norm": 0.20829637348651886, "learning_rate": 0.001, "loss": 1.8186, "step": 870400 }, { "epoch": 50.9153652687606, "grad_norm": 0.2440720945596695, "learning_rate": 0.001, "loss": 1.8112, "step": 870500 }, { "epoch": 50.921214248113706, "grad_norm": 0.24157975614070892, "learning_rate": 0.001, "loss": 1.8136, "step": 870600 }, { "epoch": 50.92706322746681, "grad_norm": 0.25836724042892456, "learning_rate": 0.001, "loss": 1.8142, "step": 870700 }, { "epoch": 50.93291220681991, "grad_norm": 0.23420004546642303, "learning_rate": 0.001, "loss": 1.8073, "step": 870800 }, { "epoch": 50.93876118617301, "grad_norm": 0.21725483238697052, "learning_rate": 0.001, "loss": 1.8177, "step": 870900 }, { "epoch": 50.944610165526115, "grad_norm": 0.24331730604171753, "learning_rate": 0.001, "loss": 1.8177, "step": 871000 }, { "epoch": 50.95045914487922, "grad_norm": 0.1962728351354599, "learning_rate": 0.001, "loss": 1.8103, "step": 871100 }, { "epoch": 50.95630812423232, "grad_norm": 0.2386530637741089, "learning_rate": 0.001, "loss": 1.8149, "step": 871200 }, { "epoch": 50.962157103585426, "grad_norm": 0.20698215067386627, "learning_rate": 0.001, "loss": 1.8145, "step": 871300 }, { "epoch": 50.96800608293853, "grad_norm": 0.24861010909080505, "learning_rate": 0.001, "loss": 1.8148, "step": 871400 }, { "epoch": 50.97385506229163, "grad_norm": 0.22370637953281403, "learning_rate": 0.001, "loss": 1.8135, "step": 871500 }, { "epoch": 50.97970404164473, "grad_norm": 0.24812699854373932, "learning_rate": 0.001, "loss": 1.8154, "step": 871600 }, { "epoch": 50.985553020997834, "grad_norm": 0.22494834661483765, "learning_rate": 0.001, "loss": 1.8124, "step": 871700 }, { "epoch": 50.99140200035094, "grad_norm": 0.2414730340242386, "learning_rate": 0.001, "loss": 1.8088, "step": 871800 }, { "epoch": 50.99725097970404, "grad_norm": 0.21753089129924774, "learning_rate": 0.001, "loss": 1.8134, "step": 871900 }, { "epoch": 51.003099959057145, "grad_norm": 0.14339300990104675, "learning_rate": 0.001, "loss": 1.8087, "step": 872000 }, { "epoch": 51.00894893841025, "grad_norm": 0.22544410824775696, "learning_rate": 0.001, "loss": 1.7988, "step": 872100 }, { "epoch": 51.01479791776335, "grad_norm": 0.13396792113780975, "learning_rate": 0.001, "loss": 1.797, "step": 872200 }, { "epoch": 51.02064689711645, "grad_norm": 0.18314924836158752, "learning_rate": 0.001, "loss": 1.8019, "step": 872300 }, { "epoch": 51.02649587646955, "grad_norm": 0.16005684435367584, "learning_rate": 0.001, "loss": 1.7955, "step": 872400 }, { "epoch": 51.03234485582266, "grad_norm": 0.1604052633047104, "learning_rate": 0.001, "loss": 1.7986, "step": 872500 }, { "epoch": 51.03819383517576, "grad_norm": 0.17799852788448334, "learning_rate": 0.001, "loss": 1.7949, "step": 872600 }, { "epoch": 51.044042814528865, "grad_norm": 0.22228729724884033, "learning_rate": 0.001, "loss": 1.8022, "step": 872700 }, { "epoch": 51.04989179388197, "grad_norm": 0.1704215556383133, "learning_rate": 0.001, "loss": 1.7981, "step": 872800 }, { "epoch": 51.05574077323507, "grad_norm": 0.24100475013256073, "learning_rate": 0.001, "loss": 1.8066, "step": 872900 }, { "epoch": 51.061589752588176, "grad_norm": 0.13723686337471008, "learning_rate": 0.001, "loss": 1.7985, "step": 873000 }, { "epoch": 51.06743873194127, "grad_norm": 0.23298054933547974, "learning_rate": 0.001, "loss": 1.7984, "step": 873100 }, { "epoch": 51.07328771129438, "grad_norm": 0.20345760881900787, "learning_rate": 0.001, "loss": 1.8019, "step": 873200 }, { "epoch": 51.07913669064748, "grad_norm": 0.21801237761974335, "learning_rate": 0.001, "loss": 1.7997, "step": 873300 }, { "epoch": 51.084985670000584, "grad_norm": 0.22541336715221405, "learning_rate": 0.001, "loss": 1.8007, "step": 873400 }, { "epoch": 51.09083464935369, "grad_norm": 0.18778972327709198, "learning_rate": 0.001, "loss": 1.8, "step": 873500 }, { "epoch": 51.09668362870679, "grad_norm": 0.1842224895954132, "learning_rate": 0.001, "loss": 1.8059, "step": 873600 }, { "epoch": 51.102532608059896, "grad_norm": 0.20860163867473602, "learning_rate": 0.001, "loss": 1.8026, "step": 873700 }, { "epoch": 51.108381587413, "grad_norm": 0.16279587149620056, "learning_rate": 0.001, "loss": 1.8012, "step": 873800 }, { "epoch": 51.114230566766096, "grad_norm": 0.19184158742427826, "learning_rate": 0.001, "loss": 1.7944, "step": 873900 }, { "epoch": 51.1200795461192, "grad_norm": 0.17453444004058838, "learning_rate": 0.001, "loss": 1.7998, "step": 874000 }, { "epoch": 51.125928525472304, "grad_norm": 0.17941969633102417, "learning_rate": 0.001, "loss": 1.799, "step": 874100 }, { "epoch": 51.13177750482541, "grad_norm": 0.18829591572284698, "learning_rate": 0.001, "loss": 1.7945, "step": 874200 }, { "epoch": 51.13762648417851, "grad_norm": 0.1786964386701584, "learning_rate": 0.001, "loss": 1.8008, "step": 874300 }, { "epoch": 51.143475463531615, "grad_norm": 0.1773320734500885, "learning_rate": 0.001, "loss": 1.8055, "step": 874400 }, { "epoch": 51.14932444288472, "grad_norm": 0.26327642798423767, "learning_rate": 0.001, "loss": 1.803, "step": 874500 }, { "epoch": 51.15517342223782, "grad_norm": 0.17164906859397888, "learning_rate": 0.001, "loss": 1.8027, "step": 874600 }, { "epoch": 51.16102240159092, "grad_norm": 0.16117927432060242, "learning_rate": 0.001, "loss": 1.7963, "step": 874700 }, { "epoch": 51.16687138094402, "grad_norm": 0.21951787173748016, "learning_rate": 0.001, "loss": 1.8031, "step": 874800 }, { "epoch": 51.17272036029713, "grad_norm": 0.1890152245759964, "learning_rate": 0.001, "loss": 1.8062, "step": 874900 }, { "epoch": 51.17856933965023, "grad_norm": 0.1652529388666153, "learning_rate": 0.001, "loss": 1.8002, "step": 875000 }, { "epoch": 51.17856933965023, "eval_ag_news_accuracy": 0.2446875, "eval_ag_news_bleu_score": 7.892756751893138, "eval_ag_news_bleu_score_sem": 0.6370146180808292, "eval_ag_news_emb_cos_sim": 0.7342592477798462, "eval_ag_news_emb_cos_sim_sem": 0.012618889100849628, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.724724054336548, "eval_ag_news_n_ngrams_match_1": 14.4765625, "eval_ag_news_n_ngrams_match_2": 4.578125, "eval_ag_news_n_ngrams_match_3": 1.8828125, "eval_ag_news_num_pred_words": 45.375, "eval_ag_news_num_true_words": 46.6328125, "eval_ag_news_perplexity": 15.252204566400302, "eval_ag_news_pred_num_tokens": 67.671875, "eval_ag_news_rouge_score": 0.30774865311978206, "eval_ag_news_runtime": 38.6895, "eval_ag_news_samples_per_second": 12.923, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3418884125709374, "eval_ag_news_token_set_f1_sem": 0.010381987692195472, "eval_ag_news_token_set_precision": 0.32038056248685043, "eval_ag_news_token_set_recall": 0.37289815091719775, "eval_ag_news_true_num_tokens": 64.8984375, "step": 875000 }, { "epoch": 51.17856933965023, "eval_anthropic_toxic_prompts_accuracy": 0.107265625, "eval_anthropic_toxic_prompts_bleu_score": 43.70437720219895, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.50626615032546, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8943698406219482, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008625678718090057, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02934655810211727, "eval_anthropic_toxic_prompts_loss": 1.2935930490493774, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.75, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.1875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.03125, "eval_anthropic_toxic_prompts_num_pred_words": 15.0859375, "eval_anthropic_toxic_prompts_num_true_words": 15.2421875, "eval_anthropic_toxic_prompts_perplexity": 3.645862813783919, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.640625, "eval_anthropic_toxic_prompts_rouge_score": 0.7015176380450461, "eval_anthropic_toxic_prompts_runtime": 30.1233, "eval_anthropic_toxic_prompts_samples_per_second": 16.598, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.7141464661203993, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01761556791097893, "eval_anthropic_toxic_prompts_token_set_precision": 0.7139153116505452, "eval_anthropic_toxic_prompts_token_set_recall": 0.722328127399216, "eval_anthropic_toxic_prompts_true_num_tokens": 18.75, "step": 875000 }, { "epoch": 51.17856933965023, "eval_arxiv_accuracy": 0.375890625, "eval_arxiv_bleu_score": 1.8624310205595946, "eval_arxiv_bleu_score_sem": 0.17269094123505987, "eval_arxiv_emb_cos_sim": 0.4825911521911621, "eval_arxiv_emb_cos_sim_sem": 0.019347533583641052, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4295737743377686, "eval_arxiv_n_ngrams_match_1": 13.8671875, "eval_arxiv_n_ngrams_match_2": 2.375, "eval_arxiv_n_ngrams_match_3": 0.5703125, "eval_arxiv_num_pred_words": 55.3671875, "eval_arxiv_num_true_words": 85.8359375, "eval_arxiv_perplexity": 30.86348513642898, "eval_arxiv_pred_num_tokens": 125.3515625, "eval_arxiv_rouge_score": 0.1822725377985121, "eval_arxiv_runtime": 31.4119, "eval_arxiv_samples_per_second": 15.918, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.19209277986087342, "eval_arxiv_token_set_f1_sem": 0.008839058266936954, "eval_arxiv_token_set_precision": 0.13356872230012187, "eval_arxiv_token_set_recall": 0.4215234076615725, "eval_arxiv_true_num_tokens": 125.28125, "step": 875000 }, { "epoch": 51.17856933965023, "eval_python_code_alpaca_accuracy": 0.132359375, "eval_python_code_alpaca_bleu_score": 28.85779146051111, "eval_python_code_alpaca_bleu_score_sem": 1.5596315304171096, "eval_python_code_alpaca_emb_cos_sim": 0.8805035352706909, "eval_python_code_alpaca_emb_cos_sim_sem": 0.007472079247236252, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4325209856033325, "eval_python_code_alpaca_n_ngrams_match_1": 10.5078125, "eval_python_code_alpaca_n_ngrams_match_2": 5.734375, "eval_python_code_alpaca_n_ngrams_match_3": 3.28125, "eval_python_code_alpaca_num_pred_words": 17.203125, "eval_python_code_alpaca_num_true_words": 18.3671875, "eval_python_code_alpaca_perplexity": 4.189246922178577, "eval_python_code_alpaca_pred_num_tokens": 22.4453125, "eval_python_code_alpaca_rouge_score": 0.6073529068105428, "eval_python_code_alpaca_runtime": 30.8483, "eval_python_code_alpaca_samples_per_second": 16.208, "eval_python_code_alpaca_steps_per_second": 0.032, "eval_python_code_alpaca_token_set_f1": 0.6251955898637028, "eval_python_code_alpaca_token_set_f1_sem": 0.012606995695644807, "eval_python_code_alpaca_token_set_precision": 0.609548742259098, "eval_python_code_alpaca_token_set_recall": 0.6463248396161202, "eval_python_code_alpaca_true_num_tokens": 23.390625, "step": 875000 }, { "epoch": 51.17856933965023, "eval_wikibio_accuracy": 0.36665625, "eval_wikibio_bleu_score": 8.794507491965401, "eval_wikibio_bleu_score_sem": 0.7487302420930714, "eval_wikibio_emb_cos_sim": 0.6432549953460693, "eval_wikibio_emb_cos_sim_sem": 0.021936563774943352, "eval_wikibio_emb_top1_equal": 0.953125, "eval_wikibio_emb_top1_equal_sem": 0.01875615119934082, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.706087350845337, "eval_wikibio_n_ngrams_match_1": 16.296875, "eval_wikibio_n_ngrams_match_2": 5.8125, "eval_wikibio_n_ngrams_match_3": 2.359375, "eval_wikibio_num_pred_words": 49.75, "eval_wikibio_num_true_words": 51.6875, "eval_wikibio_perplexity": 14.970586123108577, "eval_wikibio_pred_num_tokens": 100.6171875, "eval_wikibio_rouge_score": 0.3239063911858102, "eval_wikibio_runtime": 31.1961, "eval_wikibio_samples_per_second": 16.028, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.34766960280044, "eval_wikibio_token_set_f1_sem": 0.01171564138982296, "eval_wikibio_token_set_precision": 0.31194428864052026, "eval_wikibio_token_set_recall": 0.4302882569549509, "eval_wikibio_true_num_tokens": 97.46875, "step": 875000 }, { "epoch": 51.17856933965023, "eval_msmarco_accuracy": 0.397703125, "eval_msmarco_bleu_score": 19.516688920930942, "eval_msmarco_bleu_score_sem": 1.7372272500417696, "eval_msmarco_emb_cos_sim": 0.7779793739318848, "eval_msmarco_emb_cos_sim_sem": 0.01803327538073063, "eval_msmarco_emb_top1_equal": 0.921875, "eval_msmarco_emb_top1_equal_sem": 0.023813825100660324, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.735066533088684, "eval_msmarco_n_ngrams_match_1": 28.2578125, "eval_msmarco_n_ngrams_match_2": 14.0703125, "eval_msmarco_n_ngrams_match_3": 8.8125, "eval_msmarco_num_pred_words": 61.0703125, "eval_msmarco_num_true_words": 61.15625, "eval_msmarco_perplexity": 5.669304987628758, "eval_msmarco_pred_num_tokens": 82.2109375, "eval_msmarco_rouge_score": 0.4537434985989096, "eval_msmarco_runtime": 25.9418, "eval_msmarco_samples_per_second": 19.274, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.47921018443059665, "eval_msmarco_token_set_f1_sem": 0.01577339148322384, "eval_msmarco_token_set_precision": 0.4427306898379662, "eval_msmarco_token_set_recall": 0.5408936761740762, "eval_msmarco_true_num_tokens": 79.171875, "step": 875000 }, { "epoch": 51.184418319003335, "grad_norm": 0.23191426694393158, "learning_rate": 0.001, "loss": 1.8017, "step": 875100 }, { "epoch": 51.19026729835644, "grad_norm": 0.15896764397621155, "learning_rate": 0.001, "loss": 1.805, "step": 875200 }, { "epoch": 51.19611627770954, "grad_norm": 0.22529450058937073, "learning_rate": 0.001, "loss": 1.8016, "step": 875300 }, { "epoch": 51.20196525706264, "grad_norm": 0.19902220368385315, "learning_rate": 0.001, "loss": 1.8079, "step": 875400 }, { "epoch": 51.20781423641574, "grad_norm": 0.17349737882614136, "learning_rate": 0.001, "loss": 1.8055, "step": 875500 }, { "epoch": 51.21366321576885, "grad_norm": 0.22425870597362518, "learning_rate": 0.001, "loss": 1.8008, "step": 875600 }, { "epoch": 51.21951219512195, "grad_norm": 0.18560269474983215, "learning_rate": 0.001, "loss": 1.802, "step": 875700 }, { "epoch": 51.225361174475054, "grad_norm": 0.17858773469924927, "learning_rate": 0.001, "loss": 1.8044, "step": 875800 }, { "epoch": 51.23121015382816, "grad_norm": 0.15120437741279602, "learning_rate": 0.001, "loss": 1.8091, "step": 875900 }, { "epoch": 51.23705913318126, "grad_norm": 0.14111557602882385, "learning_rate": 0.001, "loss": 1.8081, "step": 876000 }, { "epoch": 51.242908112534366, "grad_norm": 0.1783653348684311, "learning_rate": 0.001, "loss": 1.8084, "step": 876100 }, { "epoch": 51.24875709188746, "grad_norm": 0.16864724457263947, "learning_rate": 0.001, "loss": 1.8044, "step": 876200 }, { "epoch": 51.254606071240566, "grad_norm": 0.1685747653245926, "learning_rate": 0.001, "loss": 1.801, "step": 876300 }, { "epoch": 51.26045505059367, "grad_norm": 0.19749370217323303, "learning_rate": 0.001, "loss": 1.8011, "step": 876400 }, { "epoch": 51.266304029946774, "grad_norm": 0.16772110760211945, "learning_rate": 0.001, "loss": 1.806, "step": 876500 }, { "epoch": 51.27215300929988, "grad_norm": 0.15950289368629456, "learning_rate": 0.001, "loss": 1.8038, "step": 876600 }, { "epoch": 51.27800198865298, "grad_norm": 0.1405908316373825, "learning_rate": 0.001, "loss": 1.8021, "step": 876700 }, { "epoch": 51.283850968006085, "grad_norm": 0.17989808320999146, "learning_rate": 0.001, "loss": 1.804, "step": 876800 }, { "epoch": 51.28969994735919, "grad_norm": 0.20824119448661804, "learning_rate": 0.001, "loss": 1.8007, "step": 876900 }, { "epoch": 51.295548926712286, "grad_norm": 0.15302012860774994, "learning_rate": 0.001, "loss": 1.8016, "step": 877000 }, { "epoch": 51.30139790606539, "grad_norm": 0.18358002603054047, "learning_rate": 0.001, "loss": 1.8013, "step": 877100 }, { "epoch": 51.30724688541849, "grad_norm": 0.2569366693496704, "learning_rate": 0.001, "loss": 1.8036, "step": 877200 }, { "epoch": 51.3130958647716, "grad_norm": 0.1285179853439331, "learning_rate": 0.001, "loss": 1.8105, "step": 877300 }, { "epoch": 51.3189448441247, "grad_norm": 0.13563640415668488, "learning_rate": 0.001, "loss": 1.8044, "step": 877400 }, { "epoch": 51.324793823477805, "grad_norm": 0.18540722131729126, "learning_rate": 0.001, "loss": 1.8019, "step": 877500 }, { "epoch": 51.33064280283091, "grad_norm": 0.207391619682312, "learning_rate": 0.001, "loss": 1.8023, "step": 877600 }, { "epoch": 51.33649178218401, "grad_norm": 0.1706913560628891, "learning_rate": 0.001, "loss": 1.8038, "step": 877700 }, { "epoch": 51.34234076153711, "grad_norm": 0.21262793242931366, "learning_rate": 0.001, "loss": 1.8034, "step": 877800 }, { "epoch": 51.34818974089021, "grad_norm": 0.24157056212425232, "learning_rate": 0.001, "loss": 1.8007, "step": 877900 }, { "epoch": 51.35403872024332, "grad_norm": 0.20690438151359558, "learning_rate": 0.001, "loss": 1.8056, "step": 878000 }, { "epoch": 51.35988769959642, "grad_norm": 0.16560083627700806, "learning_rate": 0.001, "loss": 1.8106, "step": 878100 }, { "epoch": 51.365736678949524, "grad_norm": 0.19172585010528564, "learning_rate": 0.001, "loss": 1.8074, "step": 878200 }, { "epoch": 51.37158565830263, "grad_norm": 0.20749396085739136, "learning_rate": 0.001, "loss": 1.8125, "step": 878300 }, { "epoch": 51.37743463765573, "grad_norm": 0.14684826135635376, "learning_rate": 0.001, "loss": 1.8051, "step": 878400 }, { "epoch": 51.38328361700883, "grad_norm": 0.1602712869644165, "learning_rate": 0.001, "loss": 1.8061, "step": 878500 }, { "epoch": 51.38913259636193, "grad_norm": 0.1929173320531845, "learning_rate": 0.001, "loss": 1.8077, "step": 878600 }, { "epoch": 51.394981575715036, "grad_norm": 0.18950119614601135, "learning_rate": 0.001, "loss": 1.8058, "step": 878700 }, { "epoch": 51.40083055506814, "grad_norm": 0.22408606112003326, "learning_rate": 0.001, "loss": 1.8089, "step": 878800 }, { "epoch": 51.406679534421244, "grad_norm": 0.2987090051174164, "learning_rate": 0.001, "loss": 1.8047, "step": 878900 }, { "epoch": 51.41252851377435, "grad_norm": 0.21957342326641083, "learning_rate": 0.001, "loss": 1.8079, "step": 879000 }, { "epoch": 51.41837749312745, "grad_norm": 0.17575602233409882, "learning_rate": 0.001, "loss": 1.8066, "step": 879100 }, { "epoch": 51.424226472480555, "grad_norm": 0.21395762264728546, "learning_rate": 0.001, "loss": 1.805, "step": 879200 }, { "epoch": 51.43007545183365, "grad_norm": 0.13352729380130768, "learning_rate": 0.001, "loss": 1.805, "step": 879300 }, { "epoch": 51.435924431186756, "grad_norm": 0.1894058883190155, "learning_rate": 0.001, "loss": 1.8076, "step": 879400 }, { "epoch": 51.44177341053986, "grad_norm": 0.16993030905723572, "learning_rate": 0.001, "loss": 1.8063, "step": 879500 }, { "epoch": 51.44762238989296, "grad_norm": 0.20341327786445618, "learning_rate": 0.001, "loss": 1.8094, "step": 879600 }, { "epoch": 51.45347136924607, "grad_norm": 0.20773421227931976, "learning_rate": 0.001, "loss": 1.8118, "step": 879700 }, { "epoch": 51.45932034859917, "grad_norm": 0.2510800063610077, "learning_rate": 0.001, "loss": 1.8074, "step": 879800 }, { "epoch": 51.465169327952275, "grad_norm": 0.14815083146095276, "learning_rate": 0.001, "loss": 1.8104, "step": 879900 }, { "epoch": 51.47101830730538, "grad_norm": 0.22874557971954346, "learning_rate": 0.001, "loss": 1.8116, "step": 880000 }, { "epoch": 51.476867286658475, "grad_norm": 0.21247361600399017, "learning_rate": 0.001, "loss": 1.8127, "step": 880100 }, { "epoch": 51.48271626601158, "grad_norm": 0.13129264116287231, "learning_rate": 0.001, "loss": 1.8089, "step": 880200 }, { "epoch": 51.48856524536468, "grad_norm": 0.1500626653432846, "learning_rate": 0.001, "loss": 1.8091, "step": 880300 }, { "epoch": 51.49441422471779, "grad_norm": 0.15279638767242432, "learning_rate": 0.001, "loss": 1.8056, "step": 880400 }, { "epoch": 51.50026320407089, "grad_norm": 0.16999012231826782, "learning_rate": 0.001, "loss": 1.806, "step": 880500 }, { "epoch": 51.506112183423994, "grad_norm": 0.13631129264831543, "learning_rate": 0.001, "loss": 1.805, "step": 880600 }, { "epoch": 51.5119611627771, "grad_norm": 0.18149955570697784, "learning_rate": 0.001, "loss": 1.8043, "step": 880700 }, { "epoch": 51.5178101421302, "grad_norm": 0.2003648728132248, "learning_rate": 0.001, "loss": 1.8096, "step": 880800 }, { "epoch": 51.5236591214833, "grad_norm": 0.17695066332817078, "learning_rate": 0.001, "loss": 1.8062, "step": 880900 }, { "epoch": 51.5295081008364, "grad_norm": 0.15819108486175537, "learning_rate": 0.001, "loss": 1.8042, "step": 881000 }, { "epoch": 51.535357080189506, "grad_norm": 0.18313166499137878, "learning_rate": 0.001, "loss": 1.8097, "step": 881100 }, { "epoch": 51.54120605954261, "grad_norm": 0.20750780403614044, "learning_rate": 0.001, "loss": 1.8065, "step": 881200 }, { "epoch": 51.547055038895714, "grad_norm": 0.1617942601442337, "learning_rate": 0.001, "loss": 1.8039, "step": 881300 }, { "epoch": 51.55290401824882, "grad_norm": 0.24801696836948395, "learning_rate": 0.001, "loss": 1.8114, "step": 881400 }, { "epoch": 51.55875299760192, "grad_norm": 0.17774510383605957, "learning_rate": 0.001, "loss": 1.8069, "step": 881500 }, { "epoch": 51.56460197695502, "grad_norm": 0.179667130112648, "learning_rate": 0.001, "loss": 1.8086, "step": 881600 }, { "epoch": 51.57045095630812, "grad_norm": 0.16070520877838135, "learning_rate": 0.001, "loss": 1.8113, "step": 881700 }, { "epoch": 51.576299935661226, "grad_norm": 0.19071601331233978, "learning_rate": 0.001, "loss": 1.8079, "step": 881800 }, { "epoch": 51.58214891501433, "grad_norm": 0.14228226244449615, "learning_rate": 0.001, "loss": 1.8072, "step": 881900 }, { "epoch": 51.58799789436743, "grad_norm": 0.1490820199251175, "learning_rate": 0.001, "loss": 1.8056, "step": 882000 }, { "epoch": 51.59384687372054, "grad_norm": 0.13294221460819244, "learning_rate": 0.001, "loss": 1.81, "step": 882100 }, { "epoch": 51.59969585307364, "grad_norm": 0.2436077743768692, "learning_rate": 0.001, "loss": 1.8076, "step": 882200 }, { "epoch": 51.605544832426745, "grad_norm": 0.32087472081184387, "learning_rate": 0.001, "loss": 1.8155, "step": 882300 }, { "epoch": 51.61139381177984, "grad_norm": 0.2226729542016983, "learning_rate": 0.001, "loss": 1.808, "step": 882400 }, { "epoch": 51.617242791132945, "grad_norm": 0.15937480330467224, "learning_rate": 0.001, "loss": 1.8105, "step": 882500 }, { "epoch": 51.62309177048605, "grad_norm": 0.15582630038261414, "learning_rate": 0.001, "loss": 1.8041, "step": 882600 }, { "epoch": 51.62894074983915, "grad_norm": 0.22518129646778107, "learning_rate": 0.001, "loss": 1.8041, "step": 882700 }, { "epoch": 51.634789729192256, "grad_norm": 0.16215921938419342, "learning_rate": 0.001, "loss": 1.8107, "step": 882800 }, { "epoch": 51.64063870854536, "grad_norm": 0.26551908254623413, "learning_rate": 0.001, "loss": 1.8008, "step": 882900 }, { "epoch": 51.646487687898464, "grad_norm": 0.17515534162521362, "learning_rate": 0.001, "loss": 1.8097, "step": 883000 }, { "epoch": 51.65233666725157, "grad_norm": 0.1988375037908554, "learning_rate": 0.001, "loss": 1.8026, "step": 883100 }, { "epoch": 51.658185646604664, "grad_norm": 0.13383466005325317, "learning_rate": 0.001, "loss": 1.8077, "step": 883200 }, { "epoch": 51.66403462595777, "grad_norm": 0.20405808091163635, "learning_rate": 0.001, "loss": 1.8034, "step": 883300 }, { "epoch": 51.66988360531087, "grad_norm": 0.16082124412059784, "learning_rate": 0.001, "loss": 1.8124, "step": 883400 }, { "epoch": 51.675732584663976, "grad_norm": 0.17824575304985046, "learning_rate": 0.001, "loss": 1.8057, "step": 883500 }, { "epoch": 51.68158156401708, "grad_norm": 0.17911703884601593, "learning_rate": 0.001, "loss": 1.805, "step": 883600 }, { "epoch": 51.68743054337018, "grad_norm": 0.2910013198852539, "learning_rate": 0.001, "loss": 1.8048, "step": 883700 }, { "epoch": 51.69327952272329, "grad_norm": 0.1522534042596817, "learning_rate": 0.001, "loss": 1.8114, "step": 883800 }, { "epoch": 51.69912850207639, "grad_norm": 0.16743573546409607, "learning_rate": 0.001, "loss": 1.809, "step": 883900 }, { "epoch": 51.70497748142949, "grad_norm": 0.17205405235290527, "learning_rate": 0.001, "loss": 1.8109, "step": 884000 }, { "epoch": 51.71082646078259, "grad_norm": 0.19488073885440826, "learning_rate": 0.001, "loss": 1.8121, "step": 884100 }, { "epoch": 51.716675440135695, "grad_norm": 0.16868051886558533, "learning_rate": 0.001, "loss": 1.8047, "step": 884200 }, { "epoch": 51.7225244194888, "grad_norm": 0.1603543758392334, "learning_rate": 0.001, "loss": 1.8076, "step": 884300 }, { "epoch": 51.7283733988419, "grad_norm": 0.185296893119812, "learning_rate": 0.001, "loss": 1.8105, "step": 884400 }, { "epoch": 51.73422237819501, "grad_norm": 0.20811046659946442, "learning_rate": 0.001, "loss": 1.8083, "step": 884500 }, { "epoch": 51.74007135754811, "grad_norm": 0.13591718673706055, "learning_rate": 0.001, "loss": 1.8117, "step": 884600 }, { "epoch": 51.74592033690121, "grad_norm": 0.19925318658351898, "learning_rate": 0.001, "loss": 1.8143, "step": 884700 }, { "epoch": 51.75176931625431, "grad_norm": 0.12867824733257294, "learning_rate": 0.001, "loss": 1.8036, "step": 884800 }, { "epoch": 51.757618295607415, "grad_norm": 0.19069348275661469, "learning_rate": 0.001, "loss": 1.8063, "step": 884900 }, { "epoch": 51.76346727496052, "grad_norm": 0.15589846670627594, "learning_rate": 0.001, "loss": 1.8038, "step": 885000 }, { "epoch": 51.76931625431362, "grad_norm": 0.20621366798877716, "learning_rate": 0.001, "loss": 1.8076, "step": 885100 }, { "epoch": 51.775165233666726, "grad_norm": 0.15122565627098083, "learning_rate": 0.001, "loss": 1.8064, "step": 885200 }, { "epoch": 51.78101421301983, "grad_norm": 0.2362280935049057, "learning_rate": 0.001, "loss": 1.8122, "step": 885300 }, { "epoch": 51.786863192372934, "grad_norm": 0.15634937584400177, "learning_rate": 0.001, "loss": 1.8089, "step": 885400 }, { "epoch": 51.79271217172603, "grad_norm": 0.1404545158147812, "learning_rate": 0.001, "loss": 1.8096, "step": 885500 }, { "epoch": 51.798561151079134, "grad_norm": 0.19265612959861755, "learning_rate": 0.001, "loss": 1.8149, "step": 885600 }, { "epoch": 51.80441013043224, "grad_norm": 0.1481974720954895, "learning_rate": 0.001, "loss": 1.8049, "step": 885700 }, { "epoch": 51.81025910978534, "grad_norm": 0.1743660271167755, "learning_rate": 0.001, "loss": 1.8097, "step": 885800 }, { "epoch": 51.816108089138446, "grad_norm": 0.18547415733337402, "learning_rate": 0.001, "loss": 1.8131, "step": 885900 }, { "epoch": 51.82195706849155, "grad_norm": 0.1909300684928894, "learning_rate": 0.001, "loss": 1.8153, "step": 886000 }, { "epoch": 51.82780604784465, "grad_norm": 0.19818070530891418, "learning_rate": 0.001, "loss": 1.8117, "step": 886100 }, { "epoch": 51.83365502719776, "grad_norm": 0.19636845588684082, "learning_rate": 0.001, "loss": 1.8127, "step": 886200 }, { "epoch": 51.839504006550854, "grad_norm": 0.25413817167282104, "learning_rate": 0.001, "loss": 1.808, "step": 886300 }, { "epoch": 51.84535298590396, "grad_norm": 0.16155223548412323, "learning_rate": 0.001, "loss": 1.8124, "step": 886400 }, { "epoch": 51.85120196525706, "grad_norm": 0.15942826867103577, "learning_rate": 0.001, "loss": 1.8045, "step": 886500 }, { "epoch": 51.857050944610165, "grad_norm": 0.1998773068189621, "learning_rate": 0.001, "loss": 1.8067, "step": 886600 }, { "epoch": 51.86289992396327, "grad_norm": 0.13520662486553192, "learning_rate": 0.001, "loss": 1.8126, "step": 886700 }, { "epoch": 51.86874890331637, "grad_norm": 0.16253216564655304, "learning_rate": 0.001, "loss": 1.8048, "step": 886800 }, { "epoch": 51.87459788266948, "grad_norm": 0.19232453405857086, "learning_rate": 0.001, "loss": 1.8104, "step": 886900 }, { "epoch": 51.88044686202258, "grad_norm": 0.18694257736206055, "learning_rate": 0.001, "loss": 1.8068, "step": 887000 }, { "epoch": 51.88629584137568, "grad_norm": 0.1702011674642563, "learning_rate": 0.001, "loss": 1.8115, "step": 887100 }, { "epoch": 51.89214482072878, "grad_norm": 0.1928698569536209, "learning_rate": 0.001, "loss": 1.8078, "step": 887200 }, { "epoch": 51.897993800081885, "grad_norm": 0.1806461364030838, "learning_rate": 0.001, "loss": 1.8128, "step": 887300 }, { "epoch": 51.90384277943499, "grad_norm": 0.240864098072052, "learning_rate": 0.001, "loss": 1.813, "step": 887400 }, { "epoch": 51.90969175878809, "grad_norm": 0.1818540245294571, "learning_rate": 0.001, "loss": 1.8081, "step": 887500 }, { "epoch": 51.915540738141196, "grad_norm": 0.13136287033557892, "learning_rate": 0.001, "loss": 1.8107, "step": 887600 }, { "epoch": 51.9213897174943, "grad_norm": 0.27322083711624146, "learning_rate": 0.001, "loss": 1.8115, "step": 887700 }, { "epoch": 51.9272386968474, "grad_norm": 0.15490491688251495, "learning_rate": 0.001, "loss": 1.8128, "step": 887800 }, { "epoch": 51.9330876762005, "grad_norm": 0.18472762405872345, "learning_rate": 0.001, "loss": 1.8122, "step": 887900 }, { "epoch": 51.938936655553604, "grad_norm": 0.20472033321857452, "learning_rate": 0.001, "loss": 1.8119, "step": 888000 }, { "epoch": 51.94478563490671, "grad_norm": 0.19963869452476501, "learning_rate": 0.001, "loss": 1.8054, "step": 888100 }, { "epoch": 51.95063461425981, "grad_norm": 0.16050413250923157, "learning_rate": 0.001, "loss": 1.8105, "step": 888200 }, { "epoch": 51.956483593612916, "grad_norm": 0.2067427635192871, "learning_rate": 0.001, "loss": 1.8139, "step": 888300 }, { "epoch": 51.96233257296602, "grad_norm": 0.13751859962940216, "learning_rate": 0.001, "loss": 1.8126, "step": 888400 }, { "epoch": 51.96818155231912, "grad_norm": 0.14502233266830444, "learning_rate": 0.001, "loss": 1.8113, "step": 888500 }, { "epoch": 51.97403053167222, "grad_norm": 0.15831217169761658, "learning_rate": 0.001, "loss": 1.8179, "step": 888600 }, { "epoch": 51.979879511025324, "grad_norm": 0.15535563230514526, "learning_rate": 0.001, "loss": 1.8115, "step": 888700 }, { "epoch": 51.98572849037843, "grad_norm": 0.15276260673999786, "learning_rate": 0.001, "loss": 1.8071, "step": 888800 }, { "epoch": 51.99157746973153, "grad_norm": 0.30983954668045044, "learning_rate": 0.001, "loss": 1.8122, "step": 888900 }, { "epoch": 51.997426449084635, "grad_norm": 0.18341206014156342, "learning_rate": 0.001, "loss": 1.8157, "step": 889000 }, { "epoch": 52.00327542843774, "grad_norm": 0.14328673481941223, "learning_rate": 0.001, "loss": 1.8067, "step": 889100 }, { "epoch": 52.00912440779084, "grad_norm": 0.18995757400989532, "learning_rate": 0.001, "loss": 1.8004, "step": 889200 }, { "epoch": 52.01497338714395, "grad_norm": 0.16683132946491241, "learning_rate": 0.001, "loss": 1.7953, "step": 889300 }, { "epoch": 52.02082236649704, "grad_norm": 0.1512465924024582, "learning_rate": 0.001, "loss": 1.7943, "step": 889400 }, { "epoch": 52.02667134585015, "grad_norm": 0.19179663062095642, "learning_rate": 0.001, "loss": 1.7935, "step": 889500 }, { "epoch": 52.03252032520325, "grad_norm": 0.22810982167720795, "learning_rate": 0.001, "loss": 1.7943, "step": 889600 }, { "epoch": 52.038369304556355, "grad_norm": 0.13426287472248077, "learning_rate": 0.001, "loss": 1.7964, "step": 889700 }, { "epoch": 52.04421828390946, "grad_norm": 0.1568271517753601, "learning_rate": 0.001, "loss": 1.8012, "step": 889800 }, { "epoch": 52.05006726326256, "grad_norm": 0.17983028292655945, "learning_rate": 0.001, "loss": 1.796, "step": 889900 }, { "epoch": 52.055916242615666, "grad_norm": 0.15561749041080475, "learning_rate": 0.001, "loss": 1.7958, "step": 890000 }, { "epoch": 52.06176522196877, "grad_norm": 0.1461886763572693, "learning_rate": 0.001, "loss": 1.8003, "step": 890100 }, { "epoch": 52.06761420132187, "grad_norm": 0.1833336353302002, "learning_rate": 0.001, "loss": 1.7976, "step": 890200 }, { "epoch": 52.07346318067497, "grad_norm": 0.14721395075321198, "learning_rate": 0.001, "loss": 1.7995, "step": 890300 }, { "epoch": 52.079312160028074, "grad_norm": 0.18152008950710297, "learning_rate": 0.001, "loss": 1.8034, "step": 890400 }, { "epoch": 52.08516113938118, "grad_norm": 0.15213795006275177, "learning_rate": 0.001, "loss": 1.8012, "step": 890500 }, { "epoch": 52.09101011873428, "grad_norm": 0.16971497237682343, "learning_rate": 0.001, "loss": 1.7963, "step": 890600 }, { "epoch": 52.096859098087386, "grad_norm": 0.17272797226905823, "learning_rate": 0.001, "loss": 1.7975, "step": 890700 }, { "epoch": 52.10270807744049, "grad_norm": 0.15097036957740784, "learning_rate": 0.001, "loss": 1.8003, "step": 890800 }, { "epoch": 52.108557056793586, "grad_norm": 0.24161109328269958, "learning_rate": 0.001, "loss": 1.8035, "step": 890900 }, { "epoch": 52.11440603614669, "grad_norm": 0.17088554799556732, "learning_rate": 0.001, "loss": 1.8024, "step": 891000 }, { "epoch": 52.120255015499794, "grad_norm": 0.18200162053108215, "learning_rate": 0.001, "loss": 1.8004, "step": 891100 }, { "epoch": 52.1261039948529, "grad_norm": 0.18356232345104218, "learning_rate": 0.001, "loss": 1.8007, "step": 891200 }, { "epoch": 52.131952974206, "grad_norm": 0.18932987749576569, "learning_rate": 0.001, "loss": 1.7938, "step": 891300 }, { "epoch": 52.137801953559105, "grad_norm": 0.17104561626911163, "learning_rate": 0.001, "loss": 1.7994, "step": 891400 }, { "epoch": 52.14365093291221, "grad_norm": 0.1758268177509308, "learning_rate": 0.001, "loss": 1.8013, "step": 891500 }, { "epoch": 52.14949991226531, "grad_norm": 0.1770586371421814, "learning_rate": 0.001, "loss": 1.7944, "step": 891600 }, { "epoch": 52.15534889161841, "grad_norm": 0.18779578804969788, "learning_rate": 0.001, "loss": 1.8029, "step": 891700 }, { "epoch": 52.16119787097151, "grad_norm": 0.16355839371681213, "learning_rate": 0.001, "loss": 1.8009, "step": 891800 }, { "epoch": 52.16704685032462, "grad_norm": 0.1613030582666397, "learning_rate": 0.001, "loss": 1.796, "step": 891900 }, { "epoch": 52.17289582967772, "grad_norm": 0.16369742155075073, "learning_rate": 0.001, "loss": 1.7952, "step": 892000 }, { "epoch": 52.178744809030825, "grad_norm": 0.19008806347846985, "learning_rate": 0.001, "loss": 1.8046, "step": 892100 }, { "epoch": 52.18459378838393, "grad_norm": 0.1849314570426941, "learning_rate": 0.001, "loss": 1.8025, "step": 892200 }, { "epoch": 52.19044276773703, "grad_norm": 0.19698062539100647, "learning_rate": 0.001, "loss": 1.7999, "step": 892300 }, { "epoch": 52.196291747090136, "grad_norm": 0.19367779791355133, "learning_rate": 0.001, "loss": 1.7992, "step": 892400 }, { "epoch": 52.20214072644323, "grad_norm": 0.1654088944196701, "learning_rate": 0.001, "loss": 1.802, "step": 892500 }, { "epoch": 52.20798970579634, "grad_norm": 0.17361867427825928, "learning_rate": 0.001, "loss": 1.8029, "step": 892600 }, { "epoch": 52.21383868514944, "grad_norm": 0.2436315268278122, "learning_rate": 0.001, "loss": 1.8063, "step": 892700 }, { "epoch": 52.219687664502544, "grad_norm": 0.23857565224170685, "learning_rate": 0.001, "loss": 1.8039, "step": 892800 }, { "epoch": 52.22553664385565, "grad_norm": 0.2310713678598404, "learning_rate": 0.001, "loss": 1.803, "step": 892900 }, { "epoch": 52.23138562320875, "grad_norm": 0.19927741587162018, "learning_rate": 0.001, "loss": 1.8038, "step": 893000 }, { "epoch": 52.237234602561855, "grad_norm": 0.15141575038433075, "learning_rate": 0.001, "loss": 1.7998, "step": 893100 }, { "epoch": 52.24308358191496, "grad_norm": 0.1420348882675171, "learning_rate": 0.001, "loss": 1.8062, "step": 893200 }, { "epoch": 52.248932561268056, "grad_norm": 0.17246268689632416, "learning_rate": 0.001, "loss": 1.8053, "step": 893300 }, { "epoch": 52.25478154062116, "grad_norm": 0.197114959359169, "learning_rate": 0.001, "loss": 1.8048, "step": 893400 }, { "epoch": 52.260630519974264, "grad_norm": 0.20755186676979065, "learning_rate": 0.001, "loss": 1.8059, "step": 893500 }, { "epoch": 52.26647949932737, "grad_norm": 0.2018871009349823, "learning_rate": 0.001, "loss": 1.8037, "step": 893600 }, { "epoch": 52.27232847868047, "grad_norm": 0.1816786825656891, "learning_rate": 0.001, "loss": 1.8032, "step": 893700 }, { "epoch": 52.278177458033575, "grad_norm": 0.1644948571920395, "learning_rate": 0.001, "loss": 1.7998, "step": 893800 }, { "epoch": 52.28402643738668, "grad_norm": 0.23536615073680878, "learning_rate": 0.001, "loss": 1.8008, "step": 893900 }, { "epoch": 52.289875416739775, "grad_norm": 0.1611543446779251, "learning_rate": 0.001, "loss": 1.8051, "step": 894000 }, { "epoch": 52.29572439609288, "grad_norm": 0.16649129986763, "learning_rate": 0.001, "loss": 1.801, "step": 894100 }, { "epoch": 52.30157337544598, "grad_norm": 0.22057658433914185, "learning_rate": 0.001, "loss": 1.8051, "step": 894200 }, { "epoch": 52.30742235479909, "grad_norm": 0.174124613404274, "learning_rate": 0.001, "loss": 1.8081, "step": 894300 }, { "epoch": 52.31327133415219, "grad_norm": 0.19193018972873688, "learning_rate": 0.001, "loss": 1.7961, "step": 894400 }, { "epoch": 52.319120313505294, "grad_norm": 0.21135130524635315, "learning_rate": 0.001, "loss": 1.8058, "step": 894500 }, { "epoch": 52.3249692928584, "grad_norm": 0.19484391808509827, "learning_rate": 0.001, "loss": 1.8055, "step": 894600 }, { "epoch": 52.3308182722115, "grad_norm": 0.16396819055080414, "learning_rate": 0.001, "loss": 1.8057, "step": 894700 }, { "epoch": 52.3366672515646, "grad_norm": 0.15612204372882843, "learning_rate": 0.001, "loss": 1.8021, "step": 894800 }, { "epoch": 52.3425162309177, "grad_norm": 0.17413881421089172, "learning_rate": 0.001, "loss": 1.7999, "step": 894900 }, { "epoch": 52.348365210270806, "grad_norm": 0.202262282371521, "learning_rate": 0.001, "loss": 1.8113, "step": 895000 }, { "epoch": 52.35421418962391, "grad_norm": 0.1663033366203308, "learning_rate": 0.001, "loss": 1.7945, "step": 895100 }, { "epoch": 52.360063168977014, "grad_norm": 0.2462836652994156, "learning_rate": 0.001, "loss": 1.8035, "step": 895200 }, { "epoch": 52.36591214833012, "grad_norm": 0.22994764149188995, "learning_rate": 0.001, "loss": 1.8027, "step": 895300 }, { "epoch": 52.37176112768322, "grad_norm": 0.2380063533782959, "learning_rate": 0.001, "loss": 1.8083, "step": 895400 }, { "epoch": 52.377610107036325, "grad_norm": 0.1614125370979309, "learning_rate": 0.001, "loss": 1.8075, "step": 895500 }, { "epoch": 52.38345908638942, "grad_norm": 0.17426446080207825, "learning_rate": 0.001, "loss": 1.803, "step": 895600 }, { "epoch": 52.389308065742526, "grad_norm": 0.18043965101242065, "learning_rate": 0.001, "loss": 1.8055, "step": 895700 }, { "epoch": 52.39515704509563, "grad_norm": 0.19772277772426605, "learning_rate": 0.001, "loss": 1.8047, "step": 895800 }, { "epoch": 52.40100602444873, "grad_norm": 0.17327363789081573, "learning_rate": 0.001, "loss": 1.799, "step": 895900 }, { "epoch": 52.40685500380184, "grad_norm": 0.1693907231092453, "learning_rate": 0.001, "loss": 1.8016, "step": 896000 }, { "epoch": 52.41270398315494, "grad_norm": 0.21210241317749023, "learning_rate": 0.001, "loss": 1.8094, "step": 896100 }, { "epoch": 52.418552962508045, "grad_norm": 0.1504185050725937, "learning_rate": 0.001, "loss": 1.8022, "step": 896200 }, { "epoch": 52.42440194186115, "grad_norm": 0.18392056226730347, "learning_rate": 0.001, "loss": 1.8037, "step": 896300 }, { "epoch": 52.430250921214245, "grad_norm": 0.2586664855480194, "learning_rate": 0.001, "loss": 1.8054, "step": 896400 }, { "epoch": 52.43609990056735, "grad_norm": 0.1983998566865921, "learning_rate": 0.001, "loss": 1.8008, "step": 896500 }, { "epoch": 52.44194887992045, "grad_norm": 0.18540629744529724, "learning_rate": 0.001, "loss": 1.7986, "step": 896600 }, { "epoch": 52.44779785927356, "grad_norm": 0.1562522053718567, "learning_rate": 0.001, "loss": 1.805, "step": 896700 }, { "epoch": 52.45364683862666, "grad_norm": 0.2209227830171585, "learning_rate": 0.001, "loss": 1.8022, "step": 896800 }, { "epoch": 52.459495817979764, "grad_norm": 0.21116550266742706, "learning_rate": 0.001, "loss": 1.8014, "step": 896900 }, { "epoch": 52.46534479733287, "grad_norm": 0.17448696494102478, "learning_rate": 0.001, "loss": 1.8034, "step": 897000 }, { "epoch": 52.471193776685965, "grad_norm": 0.18877795338630676, "learning_rate": 0.001, "loss": 1.8057, "step": 897100 }, { "epoch": 52.47704275603907, "grad_norm": 0.14922349154949188, "learning_rate": 0.001, "loss": 1.8071, "step": 897200 }, { "epoch": 52.48289173539217, "grad_norm": 0.14811182022094727, "learning_rate": 0.001, "loss": 1.8068, "step": 897300 }, { "epoch": 52.488740714745276, "grad_norm": 0.19664491713047028, "learning_rate": 0.001, "loss": 1.802, "step": 897400 }, { "epoch": 52.49458969409838, "grad_norm": 0.1673753261566162, "learning_rate": 0.001, "loss": 1.8074, "step": 897500 }, { "epoch": 52.500438673451484, "grad_norm": 0.16865193843841553, "learning_rate": 0.001, "loss": 1.8106, "step": 897600 }, { "epoch": 52.50628765280459, "grad_norm": 0.199922576546669, "learning_rate": 0.001, "loss": 1.8092, "step": 897700 }, { "epoch": 52.51213663215769, "grad_norm": 0.198725163936615, "learning_rate": 0.001, "loss": 1.8052, "step": 897800 }, { "epoch": 52.51798561151079, "grad_norm": 0.1940106451511383, "learning_rate": 0.001, "loss": 1.804, "step": 897900 }, { "epoch": 52.52383459086389, "grad_norm": 0.2309243530035019, "learning_rate": 0.001, "loss": 1.8102, "step": 898000 }, { "epoch": 52.529683570216996, "grad_norm": 0.21361203491687775, "learning_rate": 0.001, "loss": 1.8103, "step": 898100 }, { "epoch": 52.5355325495701, "grad_norm": 0.16608218848705292, "learning_rate": 0.001, "loss": 1.8069, "step": 898200 }, { "epoch": 52.5413815289232, "grad_norm": 0.22694551944732666, "learning_rate": 0.001, "loss": 1.8047, "step": 898300 }, { "epoch": 52.54723050827631, "grad_norm": 0.15396186709403992, "learning_rate": 0.001, "loss": 1.7999, "step": 898400 }, { "epoch": 52.55307948762941, "grad_norm": 0.21625152230262756, "learning_rate": 0.001, "loss": 1.805, "step": 898500 }, { "epoch": 52.558928466982515, "grad_norm": 0.16820430755615234, "learning_rate": 0.001, "loss": 1.8072, "step": 898600 }, { "epoch": 52.56477744633561, "grad_norm": 0.16570737957954407, "learning_rate": 0.001, "loss": 1.8039, "step": 898700 }, { "epoch": 52.570626425688715, "grad_norm": 0.15018481016159058, "learning_rate": 0.001, "loss": 1.8086, "step": 898800 }, { "epoch": 52.57647540504182, "grad_norm": 0.17899346351623535, "learning_rate": 0.001, "loss": 1.8042, "step": 898900 }, { "epoch": 52.58232438439492, "grad_norm": 0.15854109823703766, "learning_rate": 0.001, "loss": 1.8078, "step": 899000 }, { "epoch": 52.58817336374803, "grad_norm": 0.1653020679950714, "learning_rate": 0.001, "loss": 1.8039, "step": 899100 }, { "epoch": 52.59402234310113, "grad_norm": 0.16156721115112305, "learning_rate": 0.001, "loss": 1.8051, "step": 899200 }, { "epoch": 52.599871322454234, "grad_norm": 0.16153182089328766, "learning_rate": 0.001, "loss": 1.8122, "step": 899300 }, { "epoch": 52.60572030180734, "grad_norm": 0.18100517988204956, "learning_rate": 0.001, "loss": 1.8074, "step": 899400 }, { "epoch": 52.611569281160435, "grad_norm": 0.15912459790706635, "learning_rate": 0.001, "loss": 1.8061, "step": 899500 }, { "epoch": 52.61741826051354, "grad_norm": 0.18259337544441223, "learning_rate": 0.001, "loss": 1.8095, "step": 899600 }, { "epoch": 52.62326723986664, "grad_norm": 0.16774894297122955, "learning_rate": 0.001, "loss": 1.8045, "step": 899700 }, { "epoch": 52.629116219219746, "grad_norm": 0.18401765823364258, "learning_rate": 0.001, "loss": 1.8083, "step": 899800 }, { "epoch": 52.63496519857285, "grad_norm": 0.2041786015033722, "learning_rate": 0.001, "loss": 1.8019, "step": 899900 }, { "epoch": 52.640814177925954, "grad_norm": 0.15415909886360168, "learning_rate": 0.001, "loss": 1.8009, "step": 900000 }, { "epoch": 52.640814177925954, "eval_ag_news_accuracy": 0.24553125, "eval_ag_news_bleu_score": 6.882669517106709, "eval_ag_news_bleu_score_sem": 0.496357654624534, "eval_ag_news_emb_cos_sim": 0.7232034206390381, "eval_ag_news_emb_cos_sim_sem": 0.01479930616915226, "eval_ag_news_emb_top1_equal": 0.9375, "eval_ag_news_emb_top1_equal_sem": 0.02147948183119297, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6946513652801514, "eval_ag_news_n_ngrams_match_1": 13.7265625, "eval_ag_news_n_ngrams_match_2": 3.859375, "eval_ag_news_n_ngrams_match_3": 1.4296875, "eval_ag_news_num_pred_words": 44.5859375, "eval_ag_news_num_true_words": 45.5546875, "eval_ag_news_perplexity": 14.800357935077438, "eval_ag_news_pred_num_tokens": 68.5, "eval_ag_news_rouge_score": 0.29178235049700374, "eval_ag_news_runtime": 36.3883, "eval_ag_news_samples_per_second": 13.741, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3255136078815034, "eval_ag_news_token_set_f1_sem": 0.009323227376790666, "eval_ag_news_token_set_precision": 0.3076007997204784, "eval_ag_news_token_set_recall": 0.3543307029261053, "eval_ag_news_true_num_tokens": 62.3828125, "step": 900000 }, { "epoch": 52.640814177925954, "eval_anthropic_toxic_prompts_accuracy": 0.105046875, "eval_anthropic_toxic_prompts_bleu_score": 39.41168523294593, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.566573408653983, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8966080546379089, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.00899407360702753, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02854125206846796, "eval_anthropic_toxic_prompts_loss": 1.2688974142074585, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.796875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.84375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.671875, "eval_anthropic_toxic_prompts_num_pred_words": 15.7734375, "eval_anthropic_toxic_prompts_num_true_words": 16.296875, "eval_anthropic_toxic_prompts_perplexity": 3.5569285805717454, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.1953125, "eval_anthropic_toxic_prompts_rouge_score": 0.6788567382940534, "eval_anthropic_toxic_prompts_runtime": 29.2734, "eval_anthropic_toxic_prompts_samples_per_second": 17.08, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.6886609430004573, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017920641610590676, "eval_anthropic_toxic_prompts_token_set_precision": 0.6858640980991251, "eval_anthropic_toxic_prompts_token_set_recall": 0.6994782282309752, "eval_anthropic_toxic_prompts_true_num_tokens": 19.8515625, "step": 900000 }, { "epoch": 52.640814177925954, "eval_arxiv_accuracy": 0.37546875, "eval_arxiv_bleu_score": 1.8600133546708038, "eval_arxiv_bleu_score_sem": 0.17176275384328202, "eval_arxiv_emb_cos_sim": 0.5171769857406616, "eval_arxiv_emb_cos_sim_sem": 0.019052648916840553, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.425551652908325, "eval_arxiv_n_ngrams_match_1": 13.46875, "eval_arxiv_n_ngrams_match_2": 2.5078125, "eval_arxiv_n_ngrams_match_3": 0.5625, "eval_arxiv_num_pred_words": 54.5703125, "eval_arxiv_num_true_words": 85.015625, "eval_arxiv_perplexity": 30.73959776351837, "eval_arxiv_pred_num_tokens": 125.09375, "eval_arxiv_rouge_score": 0.1803391270278807, "eval_arxiv_runtime": 29.7063, "eval_arxiv_samples_per_second": 16.831, "eval_arxiv_steps_per_second": 0.034, "eval_arxiv_token_set_f1": 0.18751012344451237, "eval_arxiv_token_set_f1_sem": 0.009077489618000987, "eval_arxiv_token_set_precision": 0.13175392452572363, "eval_arxiv_token_set_recall": 0.4209651847397418, "eval_arxiv_true_num_tokens": 124.15625, "step": 900000 }, { "epoch": 52.640814177925954, "eval_python_code_alpaca_accuracy": 0.13265625, "eval_python_code_alpaca_bleu_score": 30.768307889504612, "eval_python_code_alpaca_bleu_score_sem": 1.7663464997911875, "eval_python_code_alpaca_emb_cos_sim": 0.8774175643920898, "eval_python_code_alpaca_emb_cos_sim_sem": 0.007962733507156372, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4499852657318115, "eval_python_code_alpaca_n_ngrams_match_1": 11.1328125, "eval_python_code_alpaca_n_ngrams_match_2": 6.4375, "eval_python_code_alpaca_n_ngrams_match_3": 3.8671875, "eval_python_code_alpaca_num_pred_words": 17.734375, "eval_python_code_alpaca_num_true_words": 19.5390625, "eval_python_code_alpaca_perplexity": 4.263051701758989, "eval_python_code_alpaca_pred_num_tokens": 23.2265625, "eval_python_code_alpaca_rouge_score": 0.6109338017865282, "eval_python_code_alpaca_runtime": 29.8326, "eval_python_code_alpaca_samples_per_second": 16.76, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6275453117850565, "eval_python_code_alpaca_token_set_f1_sem": 0.013996609478944178, "eval_python_code_alpaca_token_set_precision": 0.6046110777340681, "eval_python_code_alpaca_token_set_recall": 0.6581206209375563, "eval_python_code_alpaca_true_num_tokens": 24.7578125, "step": 900000 }, { "epoch": 52.640814177925954, "eval_wikibio_accuracy": 0.371578125, "eval_wikibio_bleu_score": 7.04904105810566, "eval_wikibio_bleu_score_sem": 0.6472665212095898, "eval_wikibio_emb_cos_sim": 0.595160186290741, "eval_wikibio_emb_cos_sim_sem": 0.02291226014494896, "eval_wikibio_emb_top1_equal": 0.9375, "eval_wikibio_emb_top1_equal_sem": 0.02147948183119297, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.711992025375366, "eval_wikibio_n_ngrams_match_1": 15.2421875, "eval_wikibio_n_ngrams_match_2": 5.1015625, "eval_wikibio_n_ngrams_match_3": 2.1171875, "eval_wikibio_num_pred_words": 52.1015625, "eval_wikibio_num_true_words": 53.0625, "eval_wikibio_perplexity": 15.059244052207811, "eval_wikibio_pred_num_tokens": 107.125, "eval_wikibio_rouge_score": 0.2952730453125292, "eval_wikibio_runtime": 30.1607, "eval_wikibio_samples_per_second": 16.578, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.30957371264847877, "eval_wikibio_token_set_f1_sem": 0.012130697842403147, "eval_wikibio_token_set_precision": 0.2744656601807375, "eval_wikibio_token_set_recall": 0.4070452830871894, "eval_wikibio_true_num_tokens": 100.9453125, "step": 900000 }, { "epoch": 52.640814177925954, "eval_msmarco_accuracy": 0.395875, "eval_msmarco_bleu_score": 17.588006217110408, "eval_msmarco_bleu_score_sem": 1.296159221066536, "eval_msmarco_emb_cos_sim": 0.8094558715820312, "eval_msmarco_emb_cos_sim_sem": 0.014332885853946209, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7804912328720093, "eval_msmarco_n_ngrams_match_1": 28.1484375, "eval_msmarco_n_ngrams_match_2": 12.921875, "eval_msmarco_n_ngrams_match_3": 7.3828125, "eval_msmarco_num_pred_words": 60.0390625, "eval_msmarco_num_true_words": 62.5859375, "eval_msmarco_perplexity": 5.93277007457346, "eval_msmarco_pred_num_tokens": 83.8203125, "eval_msmarco_rouge_score": 0.448804356205073, "eval_msmarco_runtime": 26.5274, "eval_msmarco_samples_per_second": 18.848, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.47206115785728664, "eval_msmarco_token_set_f1_sem": 0.014374032191592905, "eval_msmarco_token_set_precision": 0.4341662857470752, "eval_msmarco_token_set_recall": 0.5418268674821681, "eval_msmarco_true_num_tokens": 80.890625, "step": 900000 }, { "epoch": 52.64666315727906, "grad_norm": 0.1848738193511963, "learning_rate": 0.001, "loss": 1.8108, "step": 900100 }, { "epoch": 52.652512136632154, "grad_norm": 0.14371609687805176, "learning_rate": 0.001, "loss": 1.8069, "step": 900200 }, { "epoch": 52.65836111598526, "grad_norm": 0.18819618225097656, "learning_rate": 0.001, "loss": 1.8032, "step": 900300 }, { "epoch": 52.66421009533836, "grad_norm": 0.21680185198783875, "learning_rate": 0.001, "loss": 1.8045, "step": 900400 }, { "epoch": 52.670059074691466, "grad_norm": 0.19063672423362732, "learning_rate": 0.001, "loss": 1.8123, "step": 900500 }, { "epoch": 52.67590805404457, "grad_norm": 0.16643406450748444, "learning_rate": 0.001, "loss": 1.8072, "step": 900600 }, { "epoch": 52.68175703339767, "grad_norm": 0.15539708733558655, "learning_rate": 0.001, "loss": 1.8097, "step": 900700 }, { "epoch": 52.68760601275078, "grad_norm": 0.22145432233810425, "learning_rate": 0.001, "loss": 1.8081, "step": 900800 }, { "epoch": 52.69345499210388, "grad_norm": 0.14928673207759857, "learning_rate": 0.001, "loss": 1.8116, "step": 900900 }, { "epoch": 52.69930397145698, "grad_norm": 0.14514173567295074, "learning_rate": 0.001, "loss": 1.8074, "step": 901000 }, { "epoch": 52.70515295081008, "grad_norm": 0.17171573638916016, "learning_rate": 0.001, "loss": 1.8081, "step": 901100 }, { "epoch": 52.711001930163185, "grad_norm": 0.17964725196361542, "learning_rate": 0.001, "loss": 1.8057, "step": 901200 }, { "epoch": 52.71685090951629, "grad_norm": 0.18436263501644135, "learning_rate": 0.001, "loss": 1.8047, "step": 901300 }, { "epoch": 52.72269988886939, "grad_norm": 0.1983630210161209, "learning_rate": 0.001, "loss": 1.8042, "step": 901400 }, { "epoch": 52.7285488682225, "grad_norm": 0.1814299076795578, "learning_rate": 0.001, "loss": 1.8039, "step": 901500 }, { "epoch": 52.7343978475756, "grad_norm": 0.17505405843257904, "learning_rate": 0.001, "loss": 1.8043, "step": 901600 }, { "epoch": 52.740246826928704, "grad_norm": 0.18022149801254272, "learning_rate": 0.001, "loss": 1.8085, "step": 901700 }, { "epoch": 52.7460958062818, "grad_norm": 0.16488249599933624, "learning_rate": 0.001, "loss": 1.8047, "step": 901800 }, { "epoch": 52.751944785634905, "grad_norm": 0.23369117081165314, "learning_rate": 0.001, "loss": 1.8082, "step": 901900 }, { "epoch": 52.75779376498801, "grad_norm": 0.25007539987564087, "learning_rate": 0.001, "loss": 1.8124, "step": 902000 }, { "epoch": 52.76364274434111, "grad_norm": 0.1953391581773758, "learning_rate": 0.001, "loss": 1.8077, "step": 902100 }, { "epoch": 52.769491723694216, "grad_norm": 0.1614602953195572, "learning_rate": 0.001, "loss": 1.8015, "step": 902200 }, { "epoch": 52.77534070304732, "grad_norm": 0.17589931190013885, "learning_rate": 0.001, "loss": 1.8047, "step": 902300 }, { "epoch": 52.781189682400424, "grad_norm": 0.2570195496082306, "learning_rate": 0.001, "loss": 1.812, "step": 902400 }, { "epoch": 52.78703866175353, "grad_norm": 0.1925448179244995, "learning_rate": 0.001, "loss": 1.8118, "step": 902500 }, { "epoch": 52.792887641106624, "grad_norm": 0.16877110302448273, "learning_rate": 0.001, "loss": 1.7943, "step": 902600 }, { "epoch": 52.79873662045973, "grad_norm": 0.1608353555202484, "learning_rate": 0.001, "loss": 1.7928, "step": 902700 }, { "epoch": 52.80458559981283, "grad_norm": 0.21970918774604797, "learning_rate": 0.001, "loss": 1.7883, "step": 902800 }, { "epoch": 52.810434579165936, "grad_norm": 0.19483298063278198, "learning_rate": 0.001, "loss": 1.7893, "step": 902900 }, { "epoch": 52.81628355851904, "grad_norm": 0.22736679017543793, "learning_rate": 0.001, "loss": 1.7986, "step": 903000 }, { "epoch": 52.82213253787214, "grad_norm": 0.2125736027956009, "learning_rate": 0.001, "loss": 1.7961, "step": 903100 }, { "epoch": 52.82798151722525, "grad_norm": 0.1929914951324463, "learning_rate": 0.001, "loss": 1.7976, "step": 903200 }, { "epoch": 52.833830496578344, "grad_norm": 0.19562916457653046, "learning_rate": 0.001, "loss": 1.794, "step": 903300 }, { "epoch": 52.83967947593145, "grad_norm": 0.1789969950914383, "learning_rate": 0.001, "loss": 1.7983, "step": 903400 }, { "epoch": 52.84552845528455, "grad_norm": 0.19824253022670746, "learning_rate": 0.001, "loss": 1.7982, "step": 903500 }, { "epoch": 52.851377434637655, "grad_norm": 0.16617698967456818, "learning_rate": 0.001, "loss": 1.7984, "step": 903600 }, { "epoch": 52.85722641399076, "grad_norm": 0.16371774673461914, "learning_rate": 0.001, "loss": 1.7952, "step": 903700 }, { "epoch": 52.86307539334386, "grad_norm": 0.1842002272605896, "learning_rate": 0.001, "loss": 1.7952, "step": 903800 }, { "epoch": 52.868924372696966, "grad_norm": 0.1541849672794342, "learning_rate": 0.001, "loss": 1.7973, "step": 903900 }, { "epoch": 52.87477335205007, "grad_norm": 0.1603274792432785, "learning_rate": 0.001, "loss": 1.7901, "step": 904000 }, { "epoch": 52.88062233140317, "grad_norm": 0.15143005549907684, "learning_rate": 0.001, "loss": 1.7998, "step": 904100 }, { "epoch": 52.88647131075627, "grad_norm": 0.16739191114902496, "learning_rate": 0.001, "loss": 1.8012, "step": 904200 }, { "epoch": 52.892320290109375, "grad_norm": 0.22262471914291382, "learning_rate": 0.001, "loss": 1.799, "step": 904300 }, { "epoch": 52.89816926946248, "grad_norm": 0.18230664730072021, "learning_rate": 0.001, "loss": 1.8051, "step": 904400 }, { "epoch": 52.90401824881558, "grad_norm": 0.15553690493106842, "learning_rate": 0.001, "loss": 1.7984, "step": 904500 }, { "epoch": 52.909867228168686, "grad_norm": 0.18141710758209229, "learning_rate": 0.001, "loss": 1.8018, "step": 904600 }, { "epoch": 52.91571620752179, "grad_norm": 0.16766127943992615, "learning_rate": 0.001, "loss": 1.7937, "step": 904700 }, { "epoch": 52.92156518687489, "grad_norm": 0.1784592866897583, "learning_rate": 0.001, "loss": 1.7999, "step": 904800 }, { "epoch": 52.92741416622799, "grad_norm": 0.14862804114818573, "learning_rate": 0.001, "loss": 1.8001, "step": 904900 }, { "epoch": 52.933263145581094, "grad_norm": 0.1595689207315445, "learning_rate": 0.001, "loss": 1.8054, "step": 905000 }, { "epoch": 52.9391121249342, "grad_norm": 0.22893297672271729, "learning_rate": 0.001, "loss": 1.7997, "step": 905100 }, { "epoch": 52.9449611042873, "grad_norm": 0.1879865825176239, "learning_rate": 0.001, "loss": 1.8007, "step": 905200 }, { "epoch": 52.950810083640405, "grad_norm": 0.15640312433242798, "learning_rate": 0.001, "loss": 1.803, "step": 905300 }, { "epoch": 52.95665906299351, "grad_norm": 0.1542154997587204, "learning_rate": 0.001, "loss": 1.8004, "step": 905400 }, { "epoch": 52.96250804234661, "grad_norm": 0.21073929965496063, "learning_rate": 0.001, "loss": 1.7988, "step": 905500 }, { "epoch": 52.96835702169972, "grad_norm": 0.19387507438659668, "learning_rate": 0.001, "loss": 1.7968, "step": 905600 }, { "epoch": 52.97420600105281, "grad_norm": 0.2154369354248047, "learning_rate": 0.001, "loss": 1.8071, "step": 905700 }, { "epoch": 52.98005498040592, "grad_norm": 0.21744127571582794, "learning_rate": 0.001, "loss": 1.7999, "step": 905800 }, { "epoch": 52.98590395975902, "grad_norm": 0.1773749738931656, "learning_rate": 0.001, "loss": 1.8074, "step": 905900 }, { "epoch": 52.991752939112125, "grad_norm": 0.1470848023891449, "learning_rate": 0.001, "loss": 1.7964, "step": 906000 }, { "epoch": 52.99760191846523, "grad_norm": 0.2519082725048065, "learning_rate": 0.001, "loss": 1.8038, "step": 906100 }, { "epoch": 53.00345089781833, "grad_norm": 0.22615687549114227, "learning_rate": 0.001, "loss": 1.7977, "step": 906200 }, { "epoch": 53.009299877171436, "grad_norm": 0.21288511157035828, "learning_rate": 0.001, "loss": 1.7949, "step": 906300 }, { "epoch": 53.01514885652453, "grad_norm": 0.21828293800354004, "learning_rate": 0.001, "loss": 1.7961, "step": 906400 }, { "epoch": 53.02099783587764, "grad_norm": 0.1808352917432785, "learning_rate": 0.001, "loss": 1.7953, "step": 906500 }, { "epoch": 53.02684681523074, "grad_norm": 0.18826042115688324, "learning_rate": 0.001, "loss": 1.7945, "step": 906600 }, { "epoch": 53.032695794583844, "grad_norm": 0.21906186640262604, "learning_rate": 0.001, "loss": 1.7983, "step": 906700 }, { "epoch": 53.03854477393695, "grad_norm": 0.20675019919872284, "learning_rate": 0.001, "loss": 1.7909, "step": 906800 }, { "epoch": 53.04439375329005, "grad_norm": 0.17468947172164917, "learning_rate": 0.001, "loss": 1.7927, "step": 906900 }, { "epoch": 53.050242732643156, "grad_norm": 0.21767650544643402, "learning_rate": 0.001, "loss": 1.7997, "step": 907000 }, { "epoch": 53.05609171199626, "grad_norm": 0.1753608137369156, "learning_rate": 0.001, "loss": 1.7952, "step": 907100 }, { "epoch": 53.061940691349356, "grad_norm": 0.2418382465839386, "learning_rate": 0.001, "loss": 1.7985, "step": 907200 }, { "epoch": 53.06778967070246, "grad_norm": 0.20385049283504486, "learning_rate": 0.001, "loss": 1.7927, "step": 907300 }, { "epoch": 53.073638650055564, "grad_norm": 0.1625831127166748, "learning_rate": 0.001, "loss": 1.7904, "step": 907400 }, { "epoch": 53.07948762940867, "grad_norm": 0.23568032681941986, "learning_rate": 0.001, "loss": 1.7975, "step": 907500 }, { "epoch": 53.08533660876177, "grad_norm": 0.16136771440505981, "learning_rate": 0.001, "loss": 1.7924, "step": 907600 }, { "epoch": 53.091185588114875, "grad_norm": 0.1529301106929779, "learning_rate": 0.001, "loss": 1.7985, "step": 907700 }, { "epoch": 53.09703456746798, "grad_norm": 0.15414942800998688, "learning_rate": 0.001, "loss": 1.7952, "step": 907800 }, { "epoch": 53.10288354682108, "grad_norm": 0.17691224813461304, "learning_rate": 0.001, "loss": 1.7982, "step": 907900 }, { "epoch": 53.10873252617418, "grad_norm": 0.20476305484771729, "learning_rate": 0.001, "loss": 1.7973, "step": 908000 }, { "epoch": 53.11458150552728, "grad_norm": 0.15794917941093445, "learning_rate": 0.001, "loss": 1.798, "step": 908100 }, { "epoch": 53.12043048488039, "grad_norm": 0.18634013831615448, "learning_rate": 0.001, "loss": 1.7918, "step": 908200 }, { "epoch": 53.12627946423349, "grad_norm": 0.22104264795780182, "learning_rate": 0.001, "loss": 1.7968, "step": 908300 }, { "epoch": 53.132128443586595, "grad_norm": 0.18674562871456146, "learning_rate": 0.001, "loss": 1.7968, "step": 908400 }, { "epoch": 53.1379774229397, "grad_norm": 0.142625629901886, "learning_rate": 0.001, "loss": 1.8013, "step": 908500 }, { "epoch": 53.1438264022928, "grad_norm": 0.16643668711185455, "learning_rate": 0.001, "loss": 1.7991, "step": 908600 }, { "epoch": 53.149675381645906, "grad_norm": 0.2430828958749771, "learning_rate": 0.001, "loss": 1.7995, "step": 908700 }, { "epoch": 53.155524360999, "grad_norm": 0.18392018973827362, "learning_rate": 0.001, "loss": 1.7994, "step": 908800 }, { "epoch": 53.16137334035211, "grad_norm": 0.19530388712882996, "learning_rate": 0.001, "loss": 1.7988, "step": 908900 }, { "epoch": 53.16722231970521, "grad_norm": 0.19587300717830658, "learning_rate": 0.001, "loss": 1.7993, "step": 909000 }, { "epoch": 53.173071299058314, "grad_norm": 0.20699535310268402, "learning_rate": 0.001, "loss": 1.7969, "step": 909100 }, { "epoch": 53.17892027841142, "grad_norm": 0.14402490854263306, "learning_rate": 0.001, "loss": 1.7925, "step": 909200 }, { "epoch": 53.18476925776452, "grad_norm": 0.2024659514427185, "learning_rate": 0.001, "loss": 1.8056, "step": 909300 }, { "epoch": 53.190618237117626, "grad_norm": 0.18003125488758087, "learning_rate": 0.001, "loss": 1.7995, "step": 909400 }, { "epoch": 53.19646721647072, "grad_norm": 0.20941558480262756, "learning_rate": 0.001, "loss": 1.8023, "step": 909500 }, { "epoch": 53.202316195823826, "grad_norm": 0.18655502796173096, "learning_rate": 0.001, "loss": 1.7986, "step": 909600 }, { "epoch": 53.20816517517693, "grad_norm": 0.1934581845998764, "learning_rate": 0.001, "loss": 1.7972, "step": 909700 }, { "epoch": 53.214014154530034, "grad_norm": 0.16644617915153503, "learning_rate": 0.001, "loss": 1.7964, "step": 909800 }, { "epoch": 53.21986313388314, "grad_norm": 0.22064216434955597, "learning_rate": 0.001, "loss": 1.8005, "step": 909900 }, { "epoch": 53.22571211323624, "grad_norm": 0.20992380380630493, "learning_rate": 0.001, "loss": 1.8053, "step": 910000 }, { "epoch": 53.231561092589345, "grad_norm": 0.1920645385980606, "learning_rate": 0.001, "loss": 1.7965, "step": 910100 }, { "epoch": 53.23741007194245, "grad_norm": 0.18586745858192444, "learning_rate": 0.001, "loss": 1.8003, "step": 910200 }, { "epoch": 53.243259051295546, "grad_norm": 0.22286686301231384, "learning_rate": 0.001, "loss": 1.8029, "step": 910300 }, { "epoch": 53.24910803064865, "grad_norm": 0.17845207452774048, "learning_rate": 0.001, "loss": 1.7985, "step": 910400 }, { "epoch": 53.25495701000175, "grad_norm": 0.17839385569095612, "learning_rate": 0.001, "loss": 1.8004, "step": 910500 }, { "epoch": 53.26080598935486, "grad_norm": 0.17595906555652618, "learning_rate": 0.001, "loss": 1.7985, "step": 910600 }, { "epoch": 53.26665496870796, "grad_norm": 0.1959364414215088, "learning_rate": 0.001, "loss": 1.7988, "step": 910700 }, { "epoch": 53.272503948061065, "grad_norm": 0.20846419036388397, "learning_rate": 0.001, "loss": 1.8038, "step": 910800 }, { "epoch": 53.27835292741417, "grad_norm": 0.17682015895843506, "learning_rate": 0.001, "loss": 1.807, "step": 910900 }, { "epoch": 53.28420190676727, "grad_norm": 0.23466593027114868, "learning_rate": 0.001, "loss": 1.803, "step": 911000 }, { "epoch": 53.29005088612037, "grad_norm": 0.15798407793045044, "learning_rate": 0.001, "loss": 1.7959, "step": 911100 }, { "epoch": 53.29589986547347, "grad_norm": 0.2214352935552597, "learning_rate": 0.001, "loss": 1.805, "step": 911200 }, { "epoch": 53.30174884482658, "grad_norm": 0.20446190237998962, "learning_rate": 0.001, "loss": 1.7982, "step": 911300 }, { "epoch": 53.30759782417968, "grad_norm": 0.20673994719982147, "learning_rate": 0.001, "loss": 1.7987, "step": 911400 }, { "epoch": 53.313446803532784, "grad_norm": 0.22247104346752167, "learning_rate": 0.001, "loss": 1.7999, "step": 911500 }, { "epoch": 53.31929578288589, "grad_norm": 0.1798202395439148, "learning_rate": 0.001, "loss": 1.7977, "step": 911600 }, { "epoch": 53.32514476223899, "grad_norm": 0.18723517656326294, "learning_rate": 0.001, "loss": 1.7994, "step": 911700 }, { "epoch": 53.330993741592096, "grad_norm": 0.2250414788722992, "learning_rate": 0.001, "loss": 1.8037, "step": 911800 }, { "epoch": 53.33684272094519, "grad_norm": 0.1705757975578308, "learning_rate": 0.001, "loss": 1.8019, "step": 911900 }, { "epoch": 53.342691700298296, "grad_norm": 0.23035570979118347, "learning_rate": 0.001, "loss": 1.7977, "step": 912000 }, { "epoch": 53.3485406796514, "grad_norm": 0.1712081879377365, "learning_rate": 0.001, "loss": 1.8016, "step": 912100 }, { "epoch": 53.354389659004504, "grad_norm": 0.21949328482151031, "learning_rate": 0.001, "loss": 1.8018, "step": 912200 }, { "epoch": 53.36023863835761, "grad_norm": 0.2648165822029114, "learning_rate": 0.001, "loss": 1.8016, "step": 912300 }, { "epoch": 53.36608761771071, "grad_norm": 0.17409060895442963, "learning_rate": 0.001, "loss": 1.8032, "step": 912400 }, { "epoch": 53.371936597063815, "grad_norm": 0.21147267520427704, "learning_rate": 0.001, "loss": 1.8076, "step": 912500 }, { "epoch": 53.37778557641691, "grad_norm": 0.16608262062072754, "learning_rate": 0.001, "loss": 1.8046, "step": 912600 }, { "epoch": 53.383634555770016, "grad_norm": 0.163363516330719, "learning_rate": 0.001, "loss": 1.8056, "step": 912700 }, { "epoch": 53.38948353512312, "grad_norm": 0.1778886765241623, "learning_rate": 0.001, "loss": 1.7998, "step": 912800 }, { "epoch": 53.39533251447622, "grad_norm": 0.21118977665901184, "learning_rate": 0.001, "loss": 1.8038, "step": 912900 }, { "epoch": 53.40118149382933, "grad_norm": 0.18045955896377563, "learning_rate": 0.001, "loss": 1.8003, "step": 913000 }, { "epoch": 53.40703047318243, "grad_norm": 0.21060700714588165, "learning_rate": 0.001, "loss": 1.7986, "step": 913100 }, { "epoch": 53.412879452535535, "grad_norm": 0.1447283774614334, "learning_rate": 0.001, "loss": 1.7988, "step": 913200 }, { "epoch": 53.41872843188864, "grad_norm": 0.2006257027387619, "learning_rate": 0.001, "loss": 1.8017, "step": 913300 }, { "epoch": 53.424577411241735, "grad_norm": 0.18175281584262848, "learning_rate": 0.001, "loss": 1.8023, "step": 913400 }, { "epoch": 53.43042639059484, "grad_norm": 0.17473722994327545, "learning_rate": 0.001, "loss": 1.8058, "step": 913500 }, { "epoch": 53.43627536994794, "grad_norm": 0.18837907910346985, "learning_rate": 0.001, "loss": 1.8072, "step": 913600 }, { "epoch": 53.44212434930105, "grad_norm": 0.26486220955848694, "learning_rate": 0.001, "loss": 1.8009, "step": 913700 }, { "epoch": 53.44797332865415, "grad_norm": 0.16357354819774628, "learning_rate": 0.001, "loss": 1.8038, "step": 913800 }, { "epoch": 53.453822308007254, "grad_norm": 0.1745329648256302, "learning_rate": 0.001, "loss": 1.8083, "step": 913900 }, { "epoch": 53.45967128736036, "grad_norm": 0.191594660282135, "learning_rate": 0.001, "loss": 1.8021, "step": 914000 }, { "epoch": 53.46552026671346, "grad_norm": 0.1494152843952179, "learning_rate": 0.001, "loss": 1.8045, "step": 914100 }, { "epoch": 53.47136924606656, "grad_norm": 0.1644660085439682, "learning_rate": 0.001, "loss": 1.8001, "step": 914200 }, { "epoch": 53.47721822541966, "grad_norm": 0.17572087049484253, "learning_rate": 0.001, "loss": 1.8005, "step": 914300 }, { "epoch": 53.483067204772766, "grad_norm": 0.1938330978155136, "learning_rate": 0.001, "loss": 1.7986, "step": 914400 }, { "epoch": 53.48891618412587, "grad_norm": 0.22227266430854797, "learning_rate": 0.001, "loss": 1.8028, "step": 914500 }, { "epoch": 53.494765163478974, "grad_norm": 0.2699006497859955, "learning_rate": 0.001, "loss": 1.8074, "step": 914600 }, { "epoch": 53.50061414283208, "grad_norm": 0.1971624195575714, "learning_rate": 0.001, "loss": 1.8033, "step": 914700 }, { "epoch": 53.50646312218518, "grad_norm": 0.18380236625671387, "learning_rate": 0.001, "loss": 1.8044, "step": 914800 }, { "epoch": 53.512312101538285, "grad_norm": 0.23167690634727478, "learning_rate": 0.001, "loss": 1.8004, "step": 914900 }, { "epoch": 53.51816108089138, "grad_norm": 0.1900414228439331, "learning_rate": 0.001, "loss": 1.8002, "step": 915000 }, { "epoch": 53.524010060244485, "grad_norm": 0.23938307166099548, "learning_rate": 0.001, "loss": 1.8028, "step": 915100 }, { "epoch": 53.52985903959759, "grad_norm": 0.19675691425800323, "learning_rate": 0.001, "loss": 1.8093, "step": 915200 }, { "epoch": 53.53570801895069, "grad_norm": 0.16388198733329773, "learning_rate": 0.001, "loss": 1.8, "step": 915300 }, { "epoch": 53.5415569983038, "grad_norm": 0.19076770544052124, "learning_rate": 0.001, "loss": 1.8071, "step": 915400 }, { "epoch": 53.5474059776569, "grad_norm": 0.18015378713607788, "learning_rate": 0.001, "loss": 1.8108, "step": 915500 }, { "epoch": 53.553254957010004, "grad_norm": 0.16953089833259583, "learning_rate": 0.001, "loss": 1.8036, "step": 915600 }, { "epoch": 53.5591039363631, "grad_norm": 0.18850058317184448, "learning_rate": 0.001, "loss": 1.8041, "step": 915700 }, { "epoch": 53.564952915716205, "grad_norm": 0.18099941313266754, "learning_rate": 0.001, "loss": 1.7992, "step": 915800 }, { "epoch": 53.57080189506931, "grad_norm": 0.164703369140625, "learning_rate": 0.001, "loss": 1.8026, "step": 915900 }, { "epoch": 53.57665087442241, "grad_norm": 0.17060422897338867, "learning_rate": 0.001, "loss": 1.8041, "step": 916000 }, { "epoch": 53.582499853775516, "grad_norm": 0.2000841200351715, "learning_rate": 0.001, "loss": 1.7984, "step": 916100 }, { "epoch": 53.58834883312862, "grad_norm": 0.16528616845607758, "learning_rate": 0.001, "loss": 1.806, "step": 916200 }, { "epoch": 53.594197812481724, "grad_norm": 0.16093014180660248, "learning_rate": 0.001, "loss": 1.8029, "step": 916300 }, { "epoch": 53.60004679183483, "grad_norm": 0.22338904440402985, "learning_rate": 0.001, "loss": 1.8015, "step": 916400 }, { "epoch": 53.605895771187924, "grad_norm": 0.21029198169708252, "learning_rate": 0.001, "loss": 1.8062, "step": 916500 }, { "epoch": 53.61174475054103, "grad_norm": 0.2806776165962219, "learning_rate": 0.001, "loss": 1.8019, "step": 916600 }, { "epoch": 53.61759372989413, "grad_norm": 0.1746712028980255, "learning_rate": 0.001, "loss": 1.8049, "step": 916700 }, { "epoch": 53.623442709247236, "grad_norm": 0.17881213128566742, "learning_rate": 0.001, "loss": 1.8071, "step": 916800 }, { "epoch": 53.62929168860034, "grad_norm": 0.1920963078737259, "learning_rate": 0.001, "loss": 1.8042, "step": 916900 }, { "epoch": 53.63514066795344, "grad_norm": 0.16117225587368011, "learning_rate": 0.001, "loss": 1.8054, "step": 917000 }, { "epoch": 53.64098964730655, "grad_norm": 0.20064759254455566, "learning_rate": 0.001, "loss": 1.8022, "step": 917100 }, { "epoch": 53.64683862665965, "grad_norm": 0.2181105762720108, "learning_rate": 0.001, "loss": 1.8094, "step": 917200 }, { "epoch": 53.65268760601275, "grad_norm": 0.27038291096687317, "learning_rate": 0.001, "loss": 1.8053, "step": 917300 }, { "epoch": 53.65853658536585, "grad_norm": 0.20234224200248718, "learning_rate": 0.001, "loss": 1.8016, "step": 917400 }, { "epoch": 53.664385564718955, "grad_norm": 0.15650725364685059, "learning_rate": 0.001, "loss": 1.8019, "step": 917500 }, { "epoch": 53.67023454407206, "grad_norm": 0.1678699254989624, "learning_rate": 0.001, "loss": 1.8002, "step": 917600 }, { "epoch": 53.67608352342516, "grad_norm": 0.20242491364479065, "learning_rate": 0.001, "loss": 1.8069, "step": 917700 }, { "epoch": 53.68193250277827, "grad_norm": 0.17165781557559967, "learning_rate": 0.001, "loss": 1.8059, "step": 917800 }, { "epoch": 53.68778148213137, "grad_norm": 0.16661514341831207, "learning_rate": 0.001, "loss": 1.8038, "step": 917900 }, { "epoch": 53.693630461484474, "grad_norm": 0.1788959950208664, "learning_rate": 0.001, "loss": 1.8052, "step": 918000 }, { "epoch": 53.69947944083757, "grad_norm": 0.2314789891242981, "learning_rate": 0.001, "loss": 1.8066, "step": 918100 }, { "epoch": 53.705328420190675, "grad_norm": 0.1948019415140152, "learning_rate": 0.001, "loss": 1.804, "step": 918200 }, { "epoch": 53.71117739954378, "grad_norm": 0.16298678517341614, "learning_rate": 0.001, "loss": 1.8103, "step": 918300 }, { "epoch": 53.71702637889688, "grad_norm": 0.1917489767074585, "learning_rate": 0.001, "loss": 1.8058, "step": 918400 }, { "epoch": 53.722875358249986, "grad_norm": 0.20819790661334991, "learning_rate": 0.001, "loss": 1.8042, "step": 918500 }, { "epoch": 53.72872433760309, "grad_norm": 0.1880819946527481, "learning_rate": 0.001, "loss": 1.8064, "step": 918600 }, { "epoch": 53.734573316956194, "grad_norm": 0.18953478336334229, "learning_rate": 0.001, "loss": 1.8027, "step": 918700 }, { "epoch": 53.74042229630929, "grad_norm": 0.17701885104179382, "learning_rate": 0.001, "loss": 1.806, "step": 918800 }, { "epoch": 53.746271275662394, "grad_norm": 0.1711975485086441, "learning_rate": 0.001, "loss": 1.8039, "step": 918900 }, { "epoch": 53.7521202550155, "grad_norm": 0.2450493574142456, "learning_rate": 0.001, "loss": 1.8041, "step": 919000 }, { "epoch": 53.7579692343686, "grad_norm": 0.2774653136730194, "learning_rate": 0.001, "loss": 1.8127, "step": 919100 }, { "epoch": 53.763818213721706, "grad_norm": 0.19857966899871826, "learning_rate": 0.001, "loss": 1.8097, "step": 919200 }, { "epoch": 53.76966719307481, "grad_norm": 0.19539853930473328, "learning_rate": 0.001, "loss": 1.8026, "step": 919300 }, { "epoch": 53.77551617242791, "grad_norm": 0.2113969773054123, "learning_rate": 0.001, "loss": 1.8051, "step": 919400 }, { "epoch": 53.78136515178102, "grad_norm": 0.1953083574771881, "learning_rate": 0.001, "loss": 1.8069, "step": 919500 }, { "epoch": 53.787214131134114, "grad_norm": 0.1773470640182495, "learning_rate": 0.001, "loss": 1.8063, "step": 919600 }, { "epoch": 53.79306311048722, "grad_norm": 0.18657200038433075, "learning_rate": 0.001, "loss": 1.8041, "step": 919700 }, { "epoch": 53.79891208984032, "grad_norm": 0.15921537578105927, "learning_rate": 0.001, "loss": 1.8015, "step": 919800 }, { "epoch": 53.804761069193425, "grad_norm": 0.1906316578388214, "learning_rate": 0.001, "loss": 1.8066, "step": 919900 }, { "epoch": 53.81061004854653, "grad_norm": 0.16614070534706116, "learning_rate": 0.001, "loss": 1.8069, "step": 920000 }, { "epoch": 53.81645902789963, "grad_norm": 0.16572076082229614, "learning_rate": 0.001, "loss": 1.8073, "step": 920100 }, { "epoch": 53.82230800725274, "grad_norm": 0.14847983419895172, "learning_rate": 0.001, "loss": 1.805, "step": 920200 }, { "epoch": 53.82815698660584, "grad_norm": 0.1563614010810852, "learning_rate": 0.001, "loss": 1.8134, "step": 920300 }, { "epoch": 53.83400596595894, "grad_norm": 0.2273845076560974, "learning_rate": 0.001, "loss": 1.8094, "step": 920400 }, { "epoch": 53.83985494531204, "grad_norm": 0.21243011951446533, "learning_rate": 0.001, "loss": 1.8054, "step": 920500 }, { "epoch": 53.845703924665145, "grad_norm": 0.1683681458234787, "learning_rate": 0.001, "loss": 1.8032, "step": 920600 }, { "epoch": 53.85155290401825, "grad_norm": 0.19227854907512665, "learning_rate": 0.001, "loss": 1.8077, "step": 920700 }, { "epoch": 53.85740188337135, "grad_norm": 0.21034404635429382, "learning_rate": 0.001, "loss": 1.8073, "step": 920800 }, { "epoch": 53.863250862724456, "grad_norm": 0.17476044595241547, "learning_rate": 0.001, "loss": 1.8068, "step": 920900 }, { "epoch": 53.86909984207756, "grad_norm": 0.18261952698230743, "learning_rate": 0.001, "loss": 1.8069, "step": 921000 }, { "epoch": 53.874948821430664, "grad_norm": 0.16866469383239746, "learning_rate": 0.001, "loss": 1.8074, "step": 921100 }, { "epoch": 53.88079780078376, "grad_norm": 0.18017403781414032, "learning_rate": 0.001, "loss": 1.8018, "step": 921200 }, { "epoch": 53.886646780136864, "grad_norm": 0.1619345098733902, "learning_rate": 0.001, "loss": 1.8052, "step": 921300 }, { "epoch": 53.89249575948997, "grad_norm": 0.19691301882266998, "learning_rate": 0.001, "loss": 1.8098, "step": 921400 }, { "epoch": 53.89834473884307, "grad_norm": 0.18001435697078705, "learning_rate": 0.001, "loss": 1.8047, "step": 921500 }, { "epoch": 53.904193718196176, "grad_norm": 0.23341450095176697, "learning_rate": 0.001, "loss": 1.8055, "step": 921600 }, { "epoch": 53.91004269754928, "grad_norm": 0.21143603324890137, "learning_rate": 0.001, "loss": 1.8053, "step": 921700 }, { "epoch": 53.91589167690238, "grad_norm": 0.22777527570724487, "learning_rate": 0.001, "loss": 1.8051, "step": 921800 }, { "epoch": 53.92174065625548, "grad_norm": 0.21759505569934845, "learning_rate": 0.001, "loss": 1.8081, "step": 921900 }, { "epoch": 53.927589635608584, "grad_norm": 0.26667311787605286, "learning_rate": 0.001, "loss": 1.808, "step": 922000 }, { "epoch": 53.93343861496169, "grad_norm": 0.15482760965824127, "learning_rate": 0.001, "loss": 1.8118, "step": 922100 }, { "epoch": 53.93928759431479, "grad_norm": 0.15052111446857452, "learning_rate": 0.001, "loss": 1.8054, "step": 922200 }, { "epoch": 53.945136573667895, "grad_norm": 0.20457608997821808, "learning_rate": 0.001, "loss": 1.8077, "step": 922300 }, { "epoch": 53.950985553021, "grad_norm": 0.1930389702320099, "learning_rate": 0.001, "loss": 1.8057, "step": 922400 }, { "epoch": 53.9568345323741, "grad_norm": 0.1791686713695526, "learning_rate": 0.001, "loss": 1.8017, "step": 922500 }, { "epoch": 53.96268351172721, "grad_norm": 0.15721353888511658, "learning_rate": 0.001, "loss": 1.8104, "step": 922600 }, { "epoch": 53.9685324910803, "grad_norm": 0.17910636961460114, "learning_rate": 0.001, "loss": 1.8088, "step": 922700 }, { "epoch": 53.97438147043341, "grad_norm": 0.1754935383796692, "learning_rate": 0.001, "loss": 1.8057, "step": 922800 }, { "epoch": 53.98023044978651, "grad_norm": 0.20626859366893768, "learning_rate": 0.001, "loss": 1.8088, "step": 922900 }, { "epoch": 53.986079429139615, "grad_norm": 0.15771768987178802, "learning_rate": 0.001, "loss": 1.8089, "step": 923000 }, { "epoch": 53.99192840849272, "grad_norm": 0.1567176878452301, "learning_rate": 0.001, "loss": 1.8039, "step": 923100 }, { "epoch": 53.99777738784582, "grad_norm": 0.1891246736049652, "learning_rate": 0.001, "loss": 1.8055, "step": 923200 }, { "epoch": 54.003626367198926, "grad_norm": 0.1867973506450653, "learning_rate": 0.001, "loss": 1.8, "step": 923300 }, { "epoch": 54.00947534655203, "grad_norm": 0.21076641976833344, "learning_rate": 0.001, "loss": 1.8, "step": 923400 }, { "epoch": 54.01532432590513, "grad_norm": 0.18497751653194427, "learning_rate": 0.001, "loss": 1.7896, "step": 923500 }, { "epoch": 54.02117330525823, "grad_norm": 0.21567688882350922, "learning_rate": 0.001, "loss": 1.7955, "step": 923600 }, { "epoch": 54.027022284611334, "grad_norm": 0.18422237038612366, "learning_rate": 0.001, "loss": 1.7895, "step": 923700 }, { "epoch": 54.03287126396444, "grad_norm": 0.21044890582561493, "learning_rate": 0.001, "loss": 1.7923, "step": 923800 }, { "epoch": 54.03872024331754, "grad_norm": 0.27773308753967285, "learning_rate": 0.001, "loss": 1.7877, "step": 923900 }, { "epoch": 54.044569222670646, "grad_norm": 0.1967128962278366, "learning_rate": 0.001, "loss": 1.7947, "step": 924000 }, { "epoch": 54.05041820202375, "grad_norm": 0.23424091935157776, "learning_rate": 0.001, "loss": 1.796, "step": 924100 }, { "epoch": 54.05626718137685, "grad_norm": 0.29848513007164, "learning_rate": 0.001, "loss": 1.796, "step": 924200 }, { "epoch": 54.06211616072995, "grad_norm": 0.2240450084209442, "learning_rate": 0.001, "loss": 1.7944, "step": 924300 }, { "epoch": 54.067965140083054, "grad_norm": 0.23215866088867188, "learning_rate": 0.001, "loss": 1.7961, "step": 924400 }, { "epoch": 54.07381411943616, "grad_norm": 0.1945401430130005, "learning_rate": 0.001, "loss": 1.8016, "step": 924500 }, { "epoch": 54.07966309878926, "grad_norm": 0.23610635101795197, "learning_rate": 0.001, "loss": 1.7956, "step": 924600 }, { "epoch": 54.085512078142365, "grad_norm": 0.2418433576822281, "learning_rate": 0.001, "loss": 1.7942, "step": 924700 }, { "epoch": 54.09136105749547, "grad_norm": 0.3301449418067932, "learning_rate": 0.001, "loss": 1.8024, "step": 924800 }, { "epoch": 54.09721003684857, "grad_norm": 0.1934586614370346, "learning_rate": 0.001, "loss": 1.7979, "step": 924900 }, { "epoch": 54.10305901620167, "grad_norm": 0.20883384346961975, "learning_rate": 0.001, "loss": 1.7934, "step": 925000 }, { "epoch": 54.10305901620167, "eval_ag_news_accuracy": 0.24325, "eval_ag_news_bleu_score": 7.582478699917397, "eval_ag_news_bleu_score_sem": 0.4990437622786672, "eval_ag_news_emb_cos_sim": 0.7481516003608704, "eval_ag_news_emb_cos_sim_sem": 0.011386849917471409, "eval_ag_news_emb_top1_equal": 0.9765625, "eval_ag_news_emb_top1_equal_sem": 0.013424675911664963, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7200570106506348, "eval_ag_news_n_ngrams_match_1": 14.8828125, "eval_ag_news_n_ngrams_match_2": 4.484375, "eval_ag_news_n_ngrams_match_3": 1.7265625, "eval_ag_news_num_pred_words": 45.390625, "eval_ag_news_num_true_words": 43.765625, "eval_ag_news_perplexity": 15.181187709672045, "eval_ag_news_pred_num_tokens": 67.0625, "eval_ag_news_rouge_score": 0.325693419727252, "eval_ag_news_runtime": 38.9197, "eval_ag_news_samples_per_second": 12.847, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.35591625135189325, "eval_ag_news_token_set_f1_sem": 0.008401237586867472, "eval_ag_news_token_set_precision": 0.33920563753289695, "eval_ag_news_token_set_recall": 0.37971490075204783, "eval_ag_news_true_num_tokens": 58.609375, "step": 925000 }, { "epoch": 54.10305901620167, "eval_anthropic_toxic_prompts_accuracy": 0.1051875, "eval_anthropic_toxic_prompts_bleu_score": 42.846831063206835, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.720699935425919, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9030241966247559, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008478784002363682, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.140625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030847556262404395, "eval_anthropic_toxic_prompts_loss": 1.2961560487747192, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.328125, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.7578125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.7109375, "eval_anthropic_toxic_prompts_num_pred_words": 15.2109375, "eval_anthropic_toxic_prompts_num_true_words": 14.53125, "eval_anthropic_toxic_prompts_perplexity": 3.6552191441886244, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.6640625, "eval_anthropic_toxic_prompts_rouge_score": 0.6974431705784087, "eval_anthropic_toxic_prompts_runtime": 30.1562, "eval_anthropic_toxic_prompts_samples_per_second": 16.58, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.7082158826414259, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018593430696802724, "eval_anthropic_toxic_prompts_token_set_precision": 0.7093964535505978, "eval_anthropic_toxic_prompts_token_set_recall": 0.7135920446836886, "eval_anthropic_toxic_prompts_true_num_tokens": 17.4609375, "step": 925000 }, { "epoch": 54.10305901620167, "eval_arxiv_accuracy": 0.379953125, "eval_arxiv_bleu_score": 1.6078600925451951, "eval_arxiv_bleu_score_sem": 0.1430945110957893, "eval_arxiv_emb_cos_sim": 0.48638129234313965, "eval_arxiv_emb_cos_sim_sem": 0.018747422844171524, "eval_arxiv_emb_top1_equal": 0.9140625, "eval_arxiv_emb_top1_equal_sem": 0.024870097637176514, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4077956676483154, "eval_arxiv_n_ngrams_match_1": 13.2265625, "eval_arxiv_n_ngrams_match_2": 2.21875, "eval_arxiv_n_ngrams_match_3": 0.390625, "eval_arxiv_num_pred_words": 54.6875, "eval_arxiv_num_true_words": 85.5078125, "eval_arxiv_perplexity": 30.19860307898699, "eval_arxiv_pred_num_tokens": 125.4140625, "eval_arxiv_rouge_score": 0.17490412481245832, "eval_arxiv_runtime": 30.5201, "eval_arxiv_samples_per_second": 16.383, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.1715476838003916, "eval_arxiv_token_set_f1_sem": 0.008410534431302899, "eval_arxiv_token_set_precision": 0.11642134950790409, "eval_arxiv_token_set_recall": 0.4078834491562462, "eval_arxiv_true_num_tokens": 125.0546875, "step": 925000 }, { "epoch": 54.10305901620167, "eval_python_code_alpaca_accuracy": 0.13328125, "eval_python_code_alpaca_bleu_score": 32.68011678378821, "eval_python_code_alpaca_bleu_score_sem": 1.7819472522913695, "eval_python_code_alpaca_emb_cos_sim": 0.8841678500175476, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008315179497003555, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0234375, "eval_python_code_alpaca_exact_match_sem": 0.013424675715302162, "eval_python_code_alpaca_loss": 1.4982088804244995, "eval_python_code_alpaca_n_ngrams_match_1": 11.1640625, "eval_python_code_alpaca_n_ngrams_match_2": 6.5625, "eval_python_code_alpaca_n_ngrams_match_3": 4.0, "eval_python_code_alpaca_num_pred_words": 18.140625, "eval_python_code_alpaca_num_true_words": 19.0546875, "eval_python_code_alpaca_perplexity": 4.473669013897021, "eval_python_code_alpaca_pred_num_tokens": 24.46875, "eval_python_code_alpaca_rouge_score": 0.6280921604650768, "eval_python_code_alpaca_runtime": 29.6798, "eval_python_code_alpaca_samples_per_second": 16.846, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6448112801536849, "eval_python_code_alpaca_token_set_f1_sem": 0.013849839643803466, "eval_python_code_alpaca_token_set_precision": 0.6291901485117319, "eval_python_code_alpaca_token_set_recall": 0.6673540150014443, "eval_python_code_alpaca_true_num_tokens": 24.7890625, "step": 925000 }, { "epoch": 54.10305901620167, "eval_wikibio_accuracy": 0.372046875, "eval_wikibio_bleu_score": 6.9728090667513465, "eval_wikibio_bleu_score_sem": 0.7314671351075538, "eval_wikibio_emb_cos_sim": 0.5906095504760742, "eval_wikibio_emb_cos_sim_sem": 0.021641096100211143, "eval_wikibio_emb_top1_equal": 0.9375, "eval_wikibio_emb_top1_equal_sem": 0.02147948183119297, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.697439432144165, "eval_wikibio_n_ngrams_match_1": 14.75, "eval_wikibio_n_ngrams_match_2": 5.1796875, "eval_wikibio_n_ngrams_match_3": 2.171875, "eval_wikibio_num_pred_words": 54.03125, "eval_wikibio_num_true_words": 55.4296875, "eval_wikibio_perplexity": 14.84167990004502, "eval_wikibio_pred_num_tokens": 113.375, "eval_wikibio_rouge_score": 0.2797080992242567, "eval_wikibio_runtime": 30.3765, "eval_wikibio_samples_per_second": 16.46, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.30493313185872656, "eval_wikibio_token_set_f1_sem": 0.013421536827551326, "eval_wikibio_token_set_precision": 0.26666037850268576, "eval_wikibio_token_set_recall": 0.40672076615907266, "eval_wikibio_true_num_tokens": 104.2109375, "step": 925000 }, { "epoch": 54.10305901620167, "eval_msmarco_accuracy": 0.39446875, "eval_msmarco_bleu_score": 17.00613131193723, "eval_msmarco_bleu_score_sem": 1.4506952745025297, "eval_msmarco_emb_cos_sim": 0.8074957132339478, "eval_msmarco_emb_cos_sim_sem": 0.015362207777798176, "eval_msmarco_emb_top1_equal": 0.96875, "eval_msmarco_emb_top1_equal_sem": 0.01543935015797615, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.697805643081665, "eval_msmarco_n_ngrams_match_1": 27.3984375, "eval_msmarco_n_ngrams_match_2": 12.0078125, "eval_msmarco_n_ngrams_match_3": 6.5625, "eval_msmarco_num_pred_words": 61.1171875, "eval_msmarco_num_true_words": 60.3203125, "eval_msmarco_perplexity": 5.461948766844972, "eval_msmarco_pred_num_tokens": 82.625, "eval_msmarco_rouge_score": 0.4458034298885851, "eval_msmarco_runtime": 25.9549, "eval_msmarco_samples_per_second": 19.264, "eval_msmarco_steps_per_second": 0.039, "eval_msmarco_token_set_f1": 0.4778633942049382, "eval_msmarco_token_set_f1_sem": 0.014397106366622045, "eval_msmarco_token_set_precision": 0.44660126230093256, "eval_msmarco_token_set_recall": 0.5339783242053148, "eval_msmarco_true_num_tokens": 78.6171875, "step": 925000 }, { "epoch": 54.10890799555477, "grad_norm": 0.2402690052986145, "learning_rate": 0.001, "loss": 1.7951, "step": 925100 }, { "epoch": 54.11475697490788, "grad_norm": 0.20724375545978546, "learning_rate": 0.001, "loss": 1.7962, "step": 925200 }, { "epoch": 54.12060595426098, "grad_norm": 0.21638590097427368, "learning_rate": 0.001, "loss": 1.7944, "step": 925300 }, { "epoch": 54.126454933614085, "grad_norm": 0.18157635629177094, "learning_rate": 0.001, "loss": 1.7926, "step": 925400 }, { "epoch": 54.13230391296719, "grad_norm": 0.22767393290996552, "learning_rate": 0.001, "loss": 1.7956, "step": 925500 }, { "epoch": 54.13815289232029, "grad_norm": 0.18899688124656677, "learning_rate": 0.001, "loss": 1.7926, "step": 925600 }, { "epoch": 54.144001871673396, "grad_norm": 0.20948003232479095, "learning_rate": 0.001, "loss": 1.8, "step": 925700 }, { "epoch": 54.14985085102649, "grad_norm": 0.21542605757713318, "learning_rate": 0.001, "loss": 1.7948, "step": 925800 }, { "epoch": 54.1556998303796, "grad_norm": 0.20955583453178406, "learning_rate": 0.001, "loss": 1.7926, "step": 925900 }, { "epoch": 54.1615488097327, "grad_norm": 0.19208383560180664, "learning_rate": 0.001, "loss": 1.8001, "step": 926000 }, { "epoch": 54.167397789085804, "grad_norm": 0.20104333758354187, "learning_rate": 0.001, "loss": 1.7959, "step": 926100 }, { "epoch": 54.17324676843891, "grad_norm": 0.1795169562101364, "learning_rate": 0.001, "loss": 1.7993, "step": 926200 }, { "epoch": 54.17909574779201, "grad_norm": 0.21644702553749084, "learning_rate": 0.001, "loss": 1.796, "step": 926300 }, { "epoch": 54.184944727145115, "grad_norm": 0.20617446303367615, "learning_rate": 0.001, "loss": 1.7992, "step": 926400 }, { "epoch": 54.19079370649822, "grad_norm": 0.19070129096508026, "learning_rate": 0.001, "loss": 1.7931, "step": 926500 }, { "epoch": 54.196642685851316, "grad_norm": 0.19027864933013916, "learning_rate": 0.001, "loss": 1.7949, "step": 926600 }, { "epoch": 54.20249166520442, "grad_norm": 0.22891458868980408, "learning_rate": 0.001, "loss": 1.7978, "step": 926700 }, { "epoch": 54.20834064455752, "grad_norm": 0.18234877288341522, "learning_rate": 0.001, "loss": 1.7927, "step": 926800 }, { "epoch": 54.21418962391063, "grad_norm": 0.2266525775194168, "learning_rate": 0.001, "loss": 1.8019, "step": 926900 }, { "epoch": 54.22003860326373, "grad_norm": 0.22419224679470062, "learning_rate": 0.001, "loss": 1.7914, "step": 927000 }, { "epoch": 54.225887582616835, "grad_norm": 0.16524456441402435, "learning_rate": 0.001, "loss": 1.7973, "step": 927100 }, { "epoch": 54.23173656196994, "grad_norm": 0.21751318871974945, "learning_rate": 0.001, "loss": 1.801, "step": 927200 }, { "epoch": 54.23758554132304, "grad_norm": 0.1894235759973526, "learning_rate": 0.001, "loss": 1.7921, "step": 927300 }, { "epoch": 54.24343452067614, "grad_norm": 0.191563218832016, "learning_rate": 0.001, "loss": 1.8012, "step": 927400 }, { "epoch": 54.24928350002924, "grad_norm": 0.20462794601917267, "learning_rate": 0.001, "loss": 1.7998, "step": 927500 }, { "epoch": 54.25513247938235, "grad_norm": 0.20648619532585144, "learning_rate": 0.001, "loss": 1.7976, "step": 927600 }, { "epoch": 54.26098145873545, "grad_norm": 0.21402513980865479, "learning_rate": 0.001, "loss": 1.7964, "step": 927700 }, { "epoch": 54.266830438088554, "grad_norm": 0.18215849995613098, "learning_rate": 0.001, "loss": 1.7949, "step": 927800 }, { "epoch": 54.27267941744166, "grad_norm": 0.2137136608362198, "learning_rate": 0.001, "loss": 1.793, "step": 927900 }, { "epoch": 54.27852839679476, "grad_norm": 0.21312063932418823, "learning_rate": 0.001, "loss": 1.8016, "step": 928000 }, { "epoch": 54.28437737614786, "grad_norm": 0.22354084253311157, "learning_rate": 0.001, "loss": 1.798, "step": 928100 }, { "epoch": 54.29022635550096, "grad_norm": 0.20132556557655334, "learning_rate": 0.001, "loss": 1.7959, "step": 928200 }, { "epoch": 54.296075334854066, "grad_norm": 0.20998895168304443, "learning_rate": 0.001, "loss": 1.7958, "step": 928300 }, { "epoch": 54.30192431420717, "grad_norm": 0.19655878841876984, "learning_rate": 0.001, "loss": 1.7918, "step": 928400 }, { "epoch": 54.307773293560274, "grad_norm": 0.2537631690502167, "learning_rate": 0.001, "loss": 1.7953, "step": 928500 }, { "epoch": 54.31362227291338, "grad_norm": 0.19631238281726837, "learning_rate": 0.001, "loss": 1.7965, "step": 928600 }, { "epoch": 54.31947125226648, "grad_norm": 0.19812631607055664, "learning_rate": 0.001, "loss": 1.8024, "step": 928700 }, { "epoch": 54.325320231619585, "grad_norm": 0.26188793778419495, "learning_rate": 0.001, "loss": 1.8008, "step": 928800 }, { "epoch": 54.33116921097268, "grad_norm": 0.20662711560726166, "learning_rate": 0.001, "loss": 1.7983, "step": 928900 }, { "epoch": 54.337018190325786, "grad_norm": 0.1907813996076584, "learning_rate": 0.001, "loss": 1.7934, "step": 929000 }, { "epoch": 54.34286716967889, "grad_norm": 0.2343069314956665, "learning_rate": 0.001, "loss": 1.8009, "step": 929100 }, { "epoch": 54.34871614903199, "grad_norm": 0.21886172890663147, "learning_rate": 0.001, "loss": 1.8028, "step": 929200 }, { "epoch": 54.3545651283851, "grad_norm": 0.19824455678462982, "learning_rate": 0.001, "loss": 1.7971, "step": 929300 }, { "epoch": 54.3604141077382, "grad_norm": 0.20540140569210052, "learning_rate": 0.001, "loss": 1.7967, "step": 929400 }, { "epoch": 54.366263087091305, "grad_norm": 0.21587254106998444, "learning_rate": 0.001, "loss": 1.8014, "step": 929500 }, { "epoch": 54.37211206644441, "grad_norm": 0.20974037051200867, "learning_rate": 0.001, "loss": 1.7987, "step": 929600 }, { "epoch": 54.377961045797505, "grad_norm": 0.19687898457050323, "learning_rate": 0.001, "loss": 1.8006, "step": 929700 }, { "epoch": 54.38381002515061, "grad_norm": 0.18680031597614288, "learning_rate": 0.001, "loss": 1.7992, "step": 929800 }, { "epoch": 54.38965900450371, "grad_norm": 0.20297715067863464, "learning_rate": 0.001, "loss": 1.801, "step": 929900 }, { "epoch": 54.39550798385682, "grad_norm": 0.22291961312294006, "learning_rate": 0.001, "loss": 1.7893, "step": 930000 }, { "epoch": 54.40135696320992, "grad_norm": 0.2153765857219696, "learning_rate": 0.001, "loss": 1.7935, "step": 930100 }, { "epoch": 54.407205942563024, "grad_norm": 0.22716906666755676, "learning_rate": 0.001, "loss": 1.8052, "step": 930200 }, { "epoch": 54.41305492191613, "grad_norm": 0.21638424694538116, "learning_rate": 0.001, "loss": 1.8022, "step": 930300 }, { "epoch": 54.41890390126923, "grad_norm": 0.1996721774339676, "learning_rate": 0.001, "loss": 1.7934, "step": 930400 }, { "epoch": 54.42475288062233, "grad_norm": 0.2110423594713211, "learning_rate": 0.001, "loss": 1.7983, "step": 930500 }, { "epoch": 54.43060185997543, "grad_norm": 0.1944403350353241, "learning_rate": 0.001, "loss": 1.7993, "step": 930600 }, { "epoch": 54.436450839328536, "grad_norm": 0.18821585178375244, "learning_rate": 0.001, "loss": 1.7966, "step": 930700 }, { "epoch": 54.44229981868164, "grad_norm": 0.20424768328666687, "learning_rate": 0.001, "loss": 1.7992, "step": 930800 }, { "epoch": 54.448148798034744, "grad_norm": 0.234514057636261, "learning_rate": 0.001, "loss": 1.8073, "step": 930900 }, { "epoch": 54.45399777738785, "grad_norm": 0.22326894104480743, "learning_rate": 0.001, "loss": 1.8029, "step": 931000 }, { "epoch": 54.45984675674095, "grad_norm": 0.24163402616977692, "learning_rate": 0.001, "loss": 1.8038, "step": 931100 }, { "epoch": 54.46569573609405, "grad_norm": 0.28955167531967163, "learning_rate": 0.001, "loss": 1.8053, "step": 931200 }, { "epoch": 54.47154471544715, "grad_norm": 0.20532245934009552, "learning_rate": 0.001, "loss": 1.8034, "step": 931300 }, { "epoch": 54.477393694800256, "grad_norm": 0.17437395453453064, "learning_rate": 0.001, "loss": 1.8015, "step": 931400 }, { "epoch": 54.48324267415336, "grad_norm": 0.18663735687732697, "learning_rate": 0.001, "loss": 1.7984, "step": 931500 }, { "epoch": 54.48909165350646, "grad_norm": 0.19633400440216064, "learning_rate": 0.001, "loss": 1.7959, "step": 931600 }, { "epoch": 54.49494063285957, "grad_norm": 0.18355302512645721, "learning_rate": 0.001, "loss": 1.8072, "step": 931700 }, { "epoch": 54.50078961221267, "grad_norm": 0.20286360383033752, "learning_rate": 0.001, "loss": 1.801, "step": 931800 }, { "epoch": 54.506638591565775, "grad_norm": 0.18967431783676147, "learning_rate": 0.001, "loss": 1.7991, "step": 931900 }, { "epoch": 54.51248757091887, "grad_norm": 0.17447799444198608, "learning_rate": 0.001, "loss": 1.8008, "step": 932000 }, { "epoch": 54.518336550271975, "grad_norm": 0.2181803286075592, "learning_rate": 0.001, "loss": 1.8067, "step": 932100 }, { "epoch": 54.52418552962508, "grad_norm": 0.22994786500930786, "learning_rate": 0.001, "loss": 1.8014, "step": 932200 }, { "epoch": 54.53003450897818, "grad_norm": 0.21346376836299896, "learning_rate": 0.001, "loss": 1.7992, "step": 932300 }, { "epoch": 54.53588348833129, "grad_norm": 0.22012721002101898, "learning_rate": 0.001, "loss": 1.8019, "step": 932400 }, { "epoch": 54.54173246768439, "grad_norm": 0.20613786578178406, "learning_rate": 0.001, "loss": 1.8054, "step": 932500 }, { "epoch": 54.547581447037494, "grad_norm": 0.20801721513271332, "learning_rate": 0.001, "loss": 1.7992, "step": 932600 }, { "epoch": 54.5534304263906, "grad_norm": 0.2497728019952774, "learning_rate": 0.001, "loss": 1.7985, "step": 932700 }, { "epoch": 54.559279405743695, "grad_norm": 0.19551995396614075, "learning_rate": 0.001, "loss": 1.7991, "step": 932800 }, { "epoch": 54.5651283850968, "grad_norm": 0.1946900188922882, "learning_rate": 0.001, "loss": 1.7996, "step": 932900 }, { "epoch": 54.5709773644499, "grad_norm": 0.2447488158941269, "learning_rate": 0.001, "loss": 1.7999, "step": 933000 }, { "epoch": 54.576826343803006, "grad_norm": 0.2594253420829773, "learning_rate": 0.001, "loss": 1.8017, "step": 933100 }, { "epoch": 54.58267532315611, "grad_norm": 0.2269582450389862, "learning_rate": 0.001, "loss": 1.8044, "step": 933200 }, { "epoch": 54.588524302509214, "grad_norm": 0.29175421595573425, "learning_rate": 0.001, "loss": 1.8095, "step": 933300 }, { "epoch": 54.59437328186232, "grad_norm": 0.19593791663646698, "learning_rate": 0.001, "loss": 1.799, "step": 933400 }, { "epoch": 54.60022226121542, "grad_norm": 0.19145514070987701, "learning_rate": 0.001, "loss": 1.8026, "step": 933500 }, { "epoch": 54.60607124056852, "grad_norm": 0.19557003676891327, "learning_rate": 0.001, "loss": 1.8084, "step": 933600 }, { "epoch": 54.61192021992162, "grad_norm": 0.26105400919914246, "learning_rate": 0.001, "loss": 1.8104, "step": 933700 }, { "epoch": 54.617769199274726, "grad_norm": 0.17618229985237122, "learning_rate": 0.001, "loss": 1.8058, "step": 933800 }, { "epoch": 54.62361817862783, "grad_norm": 0.1849348396062851, "learning_rate": 0.001, "loss": 1.7985, "step": 933900 }, { "epoch": 54.62946715798093, "grad_norm": 0.23624549806118011, "learning_rate": 0.001, "loss": 1.7962, "step": 934000 }, { "epoch": 54.63531613733404, "grad_norm": 0.266053706407547, "learning_rate": 0.001, "loss": 1.7998, "step": 934100 }, { "epoch": 54.64116511668714, "grad_norm": 0.2081335335969925, "learning_rate": 0.001, "loss": 1.8026, "step": 934200 }, { "epoch": 54.64701409604024, "grad_norm": 0.2505969703197479, "learning_rate": 0.001, "loss": 1.802, "step": 934300 }, { "epoch": 54.65286307539334, "grad_norm": 0.1781625747680664, "learning_rate": 0.001, "loss": 1.8094, "step": 934400 }, { "epoch": 54.658712054746445, "grad_norm": 0.19971609115600586, "learning_rate": 0.001, "loss": 1.807, "step": 934500 }, { "epoch": 54.66456103409955, "grad_norm": 0.20394004881381989, "learning_rate": 0.001, "loss": 1.8025, "step": 934600 }, { "epoch": 54.67041001345265, "grad_norm": 0.20415307581424713, "learning_rate": 0.001, "loss": 1.8077, "step": 934700 }, { "epoch": 54.67625899280576, "grad_norm": 0.22840653359889984, "learning_rate": 0.001, "loss": 1.8045, "step": 934800 }, { "epoch": 54.68210797215886, "grad_norm": 0.2211020141839981, "learning_rate": 0.001, "loss": 1.8037, "step": 934900 }, { "epoch": 54.687956951511964, "grad_norm": 0.21002237498760223, "learning_rate": 0.001, "loss": 1.8045, "step": 935000 }, { "epoch": 54.69380593086506, "grad_norm": 0.22013214230537415, "learning_rate": 0.001, "loss": 1.8047, "step": 935100 }, { "epoch": 54.699654910218165, "grad_norm": 0.23295365273952484, "learning_rate": 0.001, "loss": 1.8058, "step": 935200 }, { "epoch": 54.70550388957127, "grad_norm": 0.24427838623523712, "learning_rate": 0.001, "loss": 1.8052, "step": 935300 }, { "epoch": 54.71135286892437, "grad_norm": 0.19650378823280334, "learning_rate": 0.001, "loss": 1.8009, "step": 935400 }, { "epoch": 54.717201848277476, "grad_norm": 0.1950773000717163, "learning_rate": 0.001, "loss": 1.8041, "step": 935500 }, { "epoch": 54.72305082763058, "grad_norm": 0.21055306494235992, "learning_rate": 0.001, "loss": 1.8044, "step": 935600 }, { "epoch": 54.728899806983684, "grad_norm": 0.21887138485908508, "learning_rate": 0.001, "loss": 1.8062, "step": 935700 }, { "epoch": 54.73474878633679, "grad_norm": 0.1897197663784027, "learning_rate": 0.001, "loss": 1.8036, "step": 935800 }, { "epoch": 54.740597765689884, "grad_norm": 0.19949468970298767, "learning_rate": 0.001, "loss": 1.8033, "step": 935900 }, { "epoch": 54.74644674504299, "grad_norm": 0.18980948626995087, "learning_rate": 0.001, "loss": 1.8053, "step": 936000 }, { "epoch": 54.75229572439609, "grad_norm": 0.22754885256290436, "learning_rate": 0.001, "loss": 1.8039, "step": 936100 }, { "epoch": 54.758144703749196, "grad_norm": 0.23176737129688263, "learning_rate": 0.001, "loss": 1.8035, "step": 936200 }, { "epoch": 54.7639936831023, "grad_norm": 0.2013046145439148, "learning_rate": 0.001, "loss": 1.8034, "step": 936300 }, { "epoch": 54.7698426624554, "grad_norm": 0.21650978922843933, "learning_rate": 0.001, "loss": 1.8043, "step": 936400 }, { "epoch": 54.77569164180851, "grad_norm": 0.24739260971546173, "learning_rate": 0.001, "loss": 1.8026, "step": 936500 }, { "epoch": 54.78154062116161, "grad_norm": 0.20434853434562683, "learning_rate": 0.001, "loss": 1.7991, "step": 936600 }, { "epoch": 54.78738960051471, "grad_norm": 0.22222769260406494, "learning_rate": 0.001, "loss": 1.8041, "step": 936700 }, { "epoch": 54.79323857986781, "grad_norm": 0.2824559211730957, "learning_rate": 0.001, "loss": 1.8034, "step": 936800 }, { "epoch": 54.799087559220915, "grad_norm": 0.1955493837594986, "learning_rate": 0.001, "loss": 1.8042, "step": 936900 }, { "epoch": 54.80493653857402, "grad_norm": 0.2147112786769867, "learning_rate": 0.001, "loss": 1.8019, "step": 937000 }, { "epoch": 54.81078551792712, "grad_norm": 0.23703168332576752, "learning_rate": 0.001, "loss": 1.8038, "step": 937100 }, { "epoch": 54.816634497280226, "grad_norm": 0.24704013764858246, "learning_rate": 0.001, "loss": 1.7998, "step": 937200 }, { "epoch": 54.82248347663333, "grad_norm": 0.21336981654167175, "learning_rate": 0.001, "loss": 1.8052, "step": 937300 }, { "epoch": 54.82833245598643, "grad_norm": 0.26270297169685364, "learning_rate": 0.001, "loss": 1.8057, "step": 937400 }, { "epoch": 54.83418143533953, "grad_norm": 0.28899186849594116, "learning_rate": 0.001, "loss": 1.8084, "step": 937500 }, { "epoch": 54.840030414692635, "grad_norm": 0.2483486831188202, "learning_rate": 0.001, "loss": 1.8016, "step": 937600 }, { "epoch": 54.84587939404574, "grad_norm": 0.21523398160934448, "learning_rate": 0.001, "loss": 1.7991, "step": 937700 }, { "epoch": 54.85172837339884, "grad_norm": 0.18895557522773743, "learning_rate": 0.001, "loss": 1.8075, "step": 937800 }, { "epoch": 54.857577352751946, "grad_norm": 0.22117853164672852, "learning_rate": 0.001, "loss": 1.8002, "step": 937900 }, { "epoch": 54.86342633210505, "grad_norm": 0.22266295552253723, "learning_rate": 0.001, "loss": 1.8081, "step": 938000 }, { "epoch": 54.86927531145815, "grad_norm": 0.20799954235553741, "learning_rate": 0.001, "loss": 1.8097, "step": 938100 }, { "epoch": 54.87512429081125, "grad_norm": 0.20694969594478607, "learning_rate": 0.001, "loss": 1.8046, "step": 938200 }, { "epoch": 54.880973270164354, "grad_norm": 0.20014236867427826, "learning_rate": 0.001, "loss": 1.7988, "step": 938300 }, { "epoch": 54.88682224951746, "grad_norm": 0.17776377499103546, "learning_rate": 0.001, "loss": 1.81, "step": 938400 }, { "epoch": 54.89267122887056, "grad_norm": 0.2085835188627243, "learning_rate": 0.001, "loss": 1.8086, "step": 938500 }, { "epoch": 54.898520208223665, "grad_norm": 0.19617608189582825, "learning_rate": 0.001, "loss": 1.8063, "step": 938600 }, { "epoch": 54.90436918757677, "grad_norm": 0.2359652817249298, "learning_rate": 0.001, "loss": 1.8, "step": 938700 }, { "epoch": 54.91021816692987, "grad_norm": 0.1907089799642563, "learning_rate": 0.001, "loss": 1.7969, "step": 938800 }, { "epoch": 54.91606714628298, "grad_norm": 0.2023703157901764, "learning_rate": 0.001, "loss": 1.8033, "step": 938900 }, { "epoch": 54.92191612563607, "grad_norm": 0.22710266709327698, "learning_rate": 0.001, "loss": 1.8008, "step": 939000 }, { "epoch": 54.92776510498918, "grad_norm": 0.1870999038219452, "learning_rate": 0.001, "loss": 1.8029, "step": 939100 }, { "epoch": 54.93361408434228, "grad_norm": 0.22685860097408295, "learning_rate": 0.001, "loss": 1.8089, "step": 939200 }, { "epoch": 54.939463063695385, "grad_norm": 0.2447783201932907, "learning_rate": 0.001, "loss": 1.8039, "step": 939300 }, { "epoch": 54.94531204304849, "grad_norm": 0.20739026367664337, "learning_rate": 0.001, "loss": 1.8061, "step": 939400 }, { "epoch": 54.95116102240159, "grad_norm": 0.221882626414299, "learning_rate": 0.001, "loss": 1.8023, "step": 939500 }, { "epoch": 54.957010001754696, "grad_norm": 0.2542356252670288, "learning_rate": 0.001, "loss": 1.8069, "step": 939600 }, { "epoch": 54.9628589811078, "grad_norm": 0.23909972608089447, "learning_rate": 0.001, "loss": 1.7996, "step": 939700 }, { "epoch": 54.9687079604609, "grad_norm": 0.2332993745803833, "learning_rate": 0.001, "loss": 1.8038, "step": 939800 }, { "epoch": 54.974556939814, "grad_norm": 0.1637314409017563, "learning_rate": 0.001, "loss": 1.8081, "step": 939900 }, { "epoch": 54.980405919167104, "grad_norm": 0.34487345814704895, "learning_rate": 0.001, "loss": 1.8114, "step": 940000 }, { "epoch": 54.98625489852021, "grad_norm": 0.24287034571170807, "learning_rate": 0.001, "loss": 1.8038, "step": 940100 }, { "epoch": 54.99210387787331, "grad_norm": 0.2444169670343399, "learning_rate": 0.001, "loss": 1.8117, "step": 940200 }, { "epoch": 54.997952857226416, "grad_norm": 0.2717137932777405, "learning_rate": 0.001, "loss": 1.8056, "step": 940300 }, { "epoch": 55.00380183657952, "grad_norm": 0.13366903364658356, "learning_rate": 0.001, "loss": 1.7935, "step": 940400 }, { "epoch": 55.009650815932616, "grad_norm": 0.14917632937431335, "learning_rate": 0.001, "loss": 1.7869, "step": 940500 }, { "epoch": 55.01549979528572, "grad_norm": 0.14837105572223663, "learning_rate": 0.001, "loss": 1.7919, "step": 940600 }, { "epoch": 55.021348774638824, "grad_norm": 0.18049277365207672, "learning_rate": 0.001, "loss": 1.7916, "step": 940700 }, { "epoch": 55.02719775399193, "grad_norm": 0.23420944809913635, "learning_rate": 0.001, "loss": 1.7984, "step": 940800 }, { "epoch": 55.03304673334503, "grad_norm": 0.32834017276763916, "learning_rate": 0.001, "loss": 1.7949, "step": 940900 }, { "epoch": 55.038895712698135, "grad_norm": 0.16351114213466644, "learning_rate": 0.001, "loss": 1.7929, "step": 941000 }, { "epoch": 55.04474469205124, "grad_norm": 0.15225309133529663, "learning_rate": 0.001, "loss": 1.7947, "step": 941100 }, { "epoch": 55.05059367140434, "grad_norm": 0.17377562820911407, "learning_rate": 0.001, "loss": 1.7908, "step": 941200 }, { "epoch": 55.05644265075744, "grad_norm": 0.15063805878162384, "learning_rate": 0.001, "loss": 1.7912, "step": 941300 }, { "epoch": 55.06229163011054, "grad_norm": 0.13445419073104858, "learning_rate": 0.001, "loss": 1.7962, "step": 941400 }, { "epoch": 55.06814060946365, "grad_norm": 0.21426348388195038, "learning_rate": 0.001, "loss": 1.7972, "step": 941500 }, { "epoch": 55.07398958881675, "grad_norm": 0.1480133831501007, "learning_rate": 0.001, "loss": 1.7911, "step": 941600 }, { "epoch": 55.079838568169855, "grad_norm": 0.1917673796415329, "learning_rate": 0.001, "loss": 1.7951, "step": 941700 }, { "epoch": 55.08568754752296, "grad_norm": 0.1666545867919922, "learning_rate": 0.001, "loss": 1.7907, "step": 941800 }, { "epoch": 55.09153652687606, "grad_norm": 0.23460383713245392, "learning_rate": 0.001, "loss": 1.7909, "step": 941900 }, { "epoch": 55.097385506229166, "grad_norm": 0.18787896633148193, "learning_rate": 0.001, "loss": 1.7927, "step": 942000 }, { "epoch": 55.10323448558226, "grad_norm": 0.22865727543830872, "learning_rate": 0.001, "loss": 1.7954, "step": 942100 }, { "epoch": 55.10908346493537, "grad_norm": 0.15991096198558807, "learning_rate": 0.001, "loss": 1.7917, "step": 942200 }, { "epoch": 55.11493244428847, "grad_norm": 0.23896096646785736, "learning_rate": 0.001, "loss": 1.7989, "step": 942300 }, { "epoch": 55.120781423641574, "grad_norm": 0.16402025520801544, "learning_rate": 0.001, "loss": 1.7978, "step": 942400 }, { "epoch": 55.12663040299468, "grad_norm": 0.17636342346668243, "learning_rate": 0.001, "loss": 1.7919, "step": 942500 }, { "epoch": 55.13247938234778, "grad_norm": 0.18808773159980774, "learning_rate": 0.001, "loss": 1.7938, "step": 942600 }, { "epoch": 55.138328361700886, "grad_norm": 0.13497181236743927, "learning_rate": 0.001, "loss": 1.7958, "step": 942700 }, { "epoch": 55.14417734105399, "grad_norm": 0.1590566188097, "learning_rate": 0.001, "loss": 1.7896, "step": 942800 }, { "epoch": 55.150026320407086, "grad_norm": 0.13338416814804077, "learning_rate": 0.001, "loss": 1.7885, "step": 942900 }, { "epoch": 55.15587529976019, "grad_norm": 0.16516511142253876, "learning_rate": 0.001, "loss": 1.7923, "step": 943000 }, { "epoch": 55.161724279113294, "grad_norm": 0.19882529973983765, "learning_rate": 0.001, "loss": 1.7934, "step": 943100 }, { "epoch": 55.1675732584664, "grad_norm": 0.15290555357933044, "learning_rate": 0.001, "loss": 1.7931, "step": 943200 }, { "epoch": 55.1734222378195, "grad_norm": 0.18029005825519562, "learning_rate": 0.001, "loss": 1.7914, "step": 943300 }, { "epoch": 55.179271217172605, "grad_norm": 0.12793909013271332, "learning_rate": 0.001, "loss": 1.7946, "step": 943400 }, { "epoch": 55.18512019652571, "grad_norm": 0.20331920683383942, "learning_rate": 0.001, "loss": 1.7933, "step": 943500 }, { "epoch": 55.190969175878806, "grad_norm": 0.13330084085464478, "learning_rate": 0.001, "loss": 1.7963, "step": 943600 }, { "epoch": 55.19681815523191, "grad_norm": 0.17669522762298584, "learning_rate": 0.001, "loss": 1.7917, "step": 943700 }, { "epoch": 55.20266713458501, "grad_norm": 0.17447826266288757, "learning_rate": 0.001, "loss": 1.7975, "step": 943800 }, { "epoch": 55.20851611393812, "grad_norm": 0.1574823409318924, "learning_rate": 0.001, "loss": 1.7913, "step": 943900 }, { "epoch": 55.21436509329122, "grad_norm": 0.22288230061531067, "learning_rate": 0.001, "loss": 1.7946, "step": 944000 }, { "epoch": 55.220214072644325, "grad_norm": 0.1472679078578949, "learning_rate": 0.001, "loss": 1.8051, "step": 944100 }, { "epoch": 55.22606305199743, "grad_norm": 0.22937317192554474, "learning_rate": 0.001, "loss": 1.7949, "step": 944200 }, { "epoch": 55.23191203135053, "grad_norm": 0.18185845017433167, "learning_rate": 0.001, "loss": 1.7977, "step": 944300 }, { "epoch": 55.23776101070363, "grad_norm": 0.19099874794483185, "learning_rate": 0.001, "loss": 1.7889, "step": 944400 }, { "epoch": 55.24360999005673, "grad_norm": 0.18346968293190002, "learning_rate": 0.001, "loss": 1.7974, "step": 944500 }, { "epoch": 55.24945896940984, "grad_norm": 0.1992785632610321, "learning_rate": 0.001, "loss": 1.7952, "step": 944600 }, { "epoch": 55.25530794876294, "grad_norm": 0.1622162014245987, "learning_rate": 0.001, "loss": 1.796, "step": 944700 }, { "epoch": 55.261156928116044, "grad_norm": 0.18205010890960693, "learning_rate": 0.001, "loss": 1.7974, "step": 944800 }, { "epoch": 55.26700590746915, "grad_norm": 0.18512240052223206, "learning_rate": 0.001, "loss": 1.7965, "step": 944900 }, { "epoch": 55.27285488682225, "grad_norm": 0.2008340209722519, "learning_rate": 0.001, "loss": 1.7964, "step": 945000 }, { "epoch": 55.278703866175356, "grad_norm": 0.20521089434623718, "learning_rate": 0.001, "loss": 1.7938, "step": 945100 }, { "epoch": 55.28455284552845, "grad_norm": 0.31995654106140137, "learning_rate": 0.001, "loss": 1.7974, "step": 945200 }, { "epoch": 55.290401824881556, "grad_norm": 0.16057907044887543, "learning_rate": 0.001, "loss": 1.798, "step": 945300 }, { "epoch": 55.29625080423466, "grad_norm": 0.18269145488739014, "learning_rate": 0.001, "loss": 1.7957, "step": 945400 }, { "epoch": 55.302099783587764, "grad_norm": 0.21816971898078918, "learning_rate": 0.001, "loss": 1.7971, "step": 945500 }, { "epoch": 55.30794876294087, "grad_norm": 0.17428189516067505, "learning_rate": 0.001, "loss": 1.7951, "step": 945600 }, { "epoch": 55.31379774229397, "grad_norm": 0.15170148015022278, "learning_rate": 0.001, "loss": 1.8018, "step": 945700 }, { "epoch": 55.319646721647075, "grad_norm": 0.17225706577301025, "learning_rate": 0.001, "loss": 1.7881, "step": 945800 }, { "epoch": 55.32549570100018, "grad_norm": 0.1798749715089798, "learning_rate": 0.001, "loss": 1.7907, "step": 945900 }, { "epoch": 55.331344680353276, "grad_norm": 0.1896384060382843, "learning_rate": 0.001, "loss": 1.7962, "step": 946000 }, { "epoch": 55.33719365970638, "grad_norm": 0.15175433456897736, "learning_rate": 0.001, "loss": 1.7978, "step": 946100 }, { "epoch": 55.34304263905948, "grad_norm": 0.16845041513442993, "learning_rate": 0.001, "loss": 1.7993, "step": 946200 }, { "epoch": 55.34889161841259, "grad_norm": 0.1692623645067215, "learning_rate": 0.001, "loss": 1.7955, "step": 946300 }, { "epoch": 55.35474059776569, "grad_norm": 0.15366394817829132, "learning_rate": 0.001, "loss": 1.791, "step": 946400 }, { "epoch": 55.360589577118795, "grad_norm": 0.173135906457901, "learning_rate": 0.001, "loss": 1.8004, "step": 946500 }, { "epoch": 55.3664385564719, "grad_norm": 0.18812717497348785, "learning_rate": 0.001, "loss": 1.7955, "step": 946600 }, { "epoch": 55.372287535824995, "grad_norm": 0.20147261023521423, "learning_rate": 0.001, "loss": 1.8002, "step": 946700 }, { "epoch": 55.3781365151781, "grad_norm": 0.15443004667758942, "learning_rate": 0.001, "loss": 1.7957, "step": 946800 }, { "epoch": 55.3839854945312, "grad_norm": 0.23334915935993195, "learning_rate": 0.001, "loss": 1.8042, "step": 946900 }, { "epoch": 55.38983447388431, "grad_norm": 0.16006740927696228, "learning_rate": 0.001, "loss": 1.8027, "step": 947000 }, { "epoch": 55.39568345323741, "grad_norm": 0.16748902201652527, "learning_rate": 0.001, "loss": 1.7942, "step": 947100 }, { "epoch": 55.401532432590514, "grad_norm": 0.18583311140537262, "learning_rate": 0.001, "loss": 1.7995, "step": 947200 }, { "epoch": 55.40738141194362, "grad_norm": 0.1866573989391327, "learning_rate": 0.001, "loss": 1.7974, "step": 947300 }, { "epoch": 55.41323039129672, "grad_norm": 0.20587439835071564, "learning_rate": 0.001, "loss": 1.8017, "step": 947400 }, { "epoch": 55.41907937064982, "grad_norm": 0.1488236039876938, "learning_rate": 0.001, "loss": 1.8047, "step": 947500 }, { "epoch": 55.42492835000292, "grad_norm": 0.13180476427078247, "learning_rate": 0.001, "loss": 1.7926, "step": 947600 }, { "epoch": 55.430777329356026, "grad_norm": 0.25271421670913696, "learning_rate": 0.001, "loss": 1.7957, "step": 947700 }, { "epoch": 55.43662630870913, "grad_norm": 0.2538442313671112, "learning_rate": 0.001, "loss": 1.8065, "step": 947800 }, { "epoch": 55.442475288062234, "grad_norm": 0.16073711216449738, "learning_rate": 0.001, "loss": 1.797, "step": 947900 }, { "epoch": 55.44832426741534, "grad_norm": 0.22041213512420654, "learning_rate": 0.001, "loss": 1.8002, "step": 948000 }, { "epoch": 55.45417324676844, "grad_norm": 0.1661258339881897, "learning_rate": 0.001, "loss": 1.8028, "step": 948100 }, { "epoch": 55.460022226121545, "grad_norm": 0.24081550538539886, "learning_rate": 0.001, "loss": 1.7979, "step": 948200 }, { "epoch": 55.46587120547464, "grad_norm": 0.13215744495391846, "learning_rate": 0.001, "loss": 1.804, "step": 948300 }, { "epoch": 55.471720184827745, "grad_norm": 0.21078984439373016, "learning_rate": 0.001, "loss": 1.7938, "step": 948400 }, { "epoch": 55.47756916418085, "grad_norm": 0.19541041553020477, "learning_rate": 0.001, "loss": 1.7977, "step": 948500 }, { "epoch": 55.48341814353395, "grad_norm": 0.16680841147899628, "learning_rate": 0.001, "loss": 1.7953, "step": 948600 }, { "epoch": 55.48926712288706, "grad_norm": 0.2244032472372055, "learning_rate": 0.001, "loss": 1.8028, "step": 948700 }, { "epoch": 55.49511610224016, "grad_norm": 0.3299770951271057, "learning_rate": 0.001, "loss": 1.798, "step": 948800 }, { "epoch": 55.500965081593264, "grad_norm": 0.1578642725944519, "learning_rate": 0.001, "loss": 1.7994, "step": 948900 }, { "epoch": 55.50681406094637, "grad_norm": 0.18765492737293243, "learning_rate": 0.001, "loss": 1.8011, "step": 949000 }, { "epoch": 55.512663040299465, "grad_norm": 0.1292874813079834, "learning_rate": 0.001, "loss": 1.7977, "step": 949100 }, { "epoch": 55.51851201965257, "grad_norm": 0.17036449909210205, "learning_rate": 0.001, "loss": 1.8024, "step": 949200 }, { "epoch": 55.52436099900567, "grad_norm": 0.13379880785942078, "learning_rate": 0.001, "loss": 1.7964, "step": 949300 }, { "epoch": 55.530209978358776, "grad_norm": 0.14012178778648376, "learning_rate": 0.001, "loss": 1.7969, "step": 949400 }, { "epoch": 55.53605895771188, "grad_norm": 0.27562782168388367, "learning_rate": 0.001, "loss": 1.8021, "step": 949500 }, { "epoch": 55.541907937064984, "grad_norm": 0.22343939542770386, "learning_rate": 0.001, "loss": 1.8003, "step": 949600 }, { "epoch": 55.54775691641809, "grad_norm": 0.14795814454555511, "learning_rate": 0.001, "loss": 1.8019, "step": 949700 }, { "epoch": 55.553605895771184, "grad_norm": 0.16827571392059326, "learning_rate": 0.001, "loss": 1.8026, "step": 949800 }, { "epoch": 55.55945487512429, "grad_norm": 0.152627632021904, "learning_rate": 0.001, "loss": 1.8045, "step": 949900 }, { "epoch": 55.56530385447739, "grad_norm": 0.27430352568626404, "learning_rate": 0.001, "loss": 1.8003, "step": 950000 }, { "epoch": 55.56530385447739, "eval_ag_news_accuracy": 0.246015625, "eval_ag_news_bleu_score": 7.618503673197539, "eval_ag_news_bleu_score_sem": 0.48905344449803634, "eval_ag_news_emb_cos_sim": 0.7262816429138184, "eval_ag_news_emb_cos_sim_sem": 0.01315644197165966, "eval_ag_news_emb_top1_equal": 0.9765625, "eval_ag_news_emb_top1_equal_sem": 0.013424675911664963, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.66261887550354, "eval_ag_news_n_ngrams_match_1": 14.2734375, "eval_ag_news_n_ngrams_match_2": 4.34375, "eval_ag_news_n_ngrams_match_3": 1.7734375, "eval_ag_news_num_pred_words": 45.9375, "eval_ag_news_num_true_words": 45.6171875, "eval_ag_news_perplexity": 14.333778368428046, "eval_ag_news_pred_num_tokens": 67.921875, "eval_ag_news_rouge_score": 0.29751097272792526, "eval_ag_news_runtime": 37.6852, "eval_ag_news_samples_per_second": 13.268, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3340971767740392, "eval_ag_news_token_set_f1_sem": 0.010097663266781548, "eval_ag_news_token_set_precision": 0.3138275567136982, "eval_ag_news_token_set_recall": 0.37024598923112473, "eval_ag_news_true_num_tokens": 63.2578125, "step": 950000 }, { "epoch": 55.56530385447739, "eval_anthropic_toxic_prompts_accuracy": 0.102484375, "eval_anthropic_toxic_prompts_bleu_score": 43.13959854249079, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.8073253855460414, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8805189728736877, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.011656020767986774, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.984375, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.011004959233105183, "eval_anthropic_toxic_prompts_exact_match": 0.1484375, "eval_anthropic_toxic_prompts_exact_match_sem": 0.0315484639796987, "eval_anthropic_toxic_prompts_loss": 1.2679634094238281, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.3515625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.8359375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.8203125, "eval_anthropic_toxic_prompts_num_pred_words": 14.9296875, "eval_anthropic_toxic_prompts_num_true_words": 14.8515625, "eval_anthropic_toxic_prompts_perplexity": 3.5536079432494323, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.25, "eval_anthropic_toxic_prompts_rouge_score": 0.6881137738602826, "eval_anthropic_toxic_prompts_runtime": 30.4185, "eval_anthropic_toxic_prompts_samples_per_second": 16.437, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.7056004241674331, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.020935769411083906, "eval_anthropic_toxic_prompts_token_set_precision": 0.702776096389488, "eval_anthropic_toxic_prompts_token_set_recall": 0.7149582895853394, "eval_anthropic_toxic_prompts_true_num_tokens": 18.0390625, "step": 950000 }, { "epoch": 55.56530385447739, "eval_arxiv_accuracy": 0.37778125, "eval_arxiv_bleu_score": 1.8573782985624252, "eval_arxiv_bleu_score_sem": 0.13562783791440058, "eval_arxiv_emb_cos_sim": 0.52301424741745, "eval_arxiv_emb_cos_sim_sem": 0.018553351983428, "eval_arxiv_emb_top1_equal": 0.9375, "eval_arxiv_emb_top1_equal_sem": 0.02147948183119297, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.426928758621216, "eval_arxiv_n_ngrams_match_1": 14.984375, "eval_arxiv_n_ngrams_match_2": 2.6953125, "eval_arxiv_n_ngrams_match_3": 0.5546875, "eval_arxiv_num_pred_words": 57.84375, "eval_arxiv_num_true_words": 86.5703125, "eval_arxiv_perplexity": 30.781958600191068, "eval_arxiv_pred_num_tokens": 125.3046875, "eval_arxiv_rouge_score": 0.19825704800767488, "eval_arxiv_runtime": 31.7179, "eval_arxiv_samples_per_second": 15.764, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.19772724132622735, "eval_arxiv_token_set_f1_sem": 0.007009098337139448, "eval_arxiv_token_set_precision": 0.13513252114701307, "eval_arxiv_token_set_recall": 0.4292218043968429, "eval_arxiv_true_num_tokens": 125.171875, "step": 950000 }, { "epoch": 55.56530385447739, "eval_python_code_alpaca_accuracy": 0.13334375, "eval_python_code_alpaca_bleu_score": 27.90619481419835, "eval_python_code_alpaca_bleu_score_sem": 1.5944105311399137, "eval_python_code_alpaca_emb_cos_sim": 0.8822159171104431, "eval_python_code_alpaca_emb_cos_sim_sem": 0.007952651008963585, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.5086820125579834, "eval_python_code_alpaca_n_ngrams_match_1": 9.7109375, "eval_python_code_alpaca_n_ngrams_match_2": 5.28125, "eval_python_code_alpaca_n_ngrams_match_3": 3.0, "eval_python_code_alpaca_num_pred_words": 16.6171875, "eval_python_code_alpaca_num_true_words": 17.1640625, "eval_python_code_alpaca_perplexity": 4.520768549918656, "eval_python_code_alpaca_pred_num_tokens": 22.3671875, "eval_python_code_alpaca_rouge_score": 0.5952201460831148, "eval_python_code_alpaca_runtime": 30.0527, "eval_python_code_alpaca_samples_per_second": 16.637, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6116641564871754, "eval_python_code_alpaca_token_set_f1_sem": 0.013637106203108141, "eval_python_code_alpaca_token_set_precision": 0.5994387994145729, "eval_python_code_alpaca_token_set_recall": 0.6299122806104133, "eval_python_code_alpaca_true_num_tokens": 22.578125, "step": 950000 }, { "epoch": 55.56530385447739, "eval_wikibio_accuracy": 0.366203125, "eval_wikibio_bleu_score": 7.250345355509859, "eval_wikibio_bleu_score_sem": 0.5835420063961284, "eval_wikibio_emb_cos_sim": 0.6421390175819397, "eval_wikibio_emb_cos_sim_sem": 0.021739045158028603, "eval_wikibio_emb_top1_equal": 0.96875, "eval_wikibio_emb_top1_equal_sem": 0.01543935015797615, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6955714225769043, "eval_wikibio_n_ngrams_match_1": 16.7265625, "eval_wikibio_n_ngrams_match_2": 5.625, "eval_wikibio_n_ngrams_match_3": 2.109375, "eval_wikibio_num_pred_words": 55.375, "eval_wikibio_num_true_words": 54.84375, "eval_wikibio_perplexity": 14.813981378603446, "eval_wikibio_pred_num_tokens": 104.3828125, "eval_wikibio_rouge_score": 0.3166080399712181, "eval_wikibio_runtime": 31.4879, "eval_wikibio_samples_per_second": 15.879, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32419180129193964, "eval_wikibio_token_set_f1_sem": 0.010130465569281353, "eval_wikibio_token_set_precision": 0.2907503772693496, "eval_wikibio_token_set_recall": 0.39712807235464415, "eval_wikibio_true_num_tokens": 103.6484375, "step": 950000 }, { "epoch": 55.56530385447739, "eval_msmarco_accuracy": 0.3948125, "eval_msmarco_bleu_score": 18.58278647257312, "eval_msmarco_bleu_score_sem": 1.6796052229275582, "eval_msmarco_emb_cos_sim": 0.7909826040267944, "eval_msmarco_emb_cos_sim_sem": 0.016550486907362938, "eval_msmarco_emb_top1_equal": 0.96875, "eval_msmarco_emb_top1_equal_sem": 0.01543935015797615, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.765411138534546, "eval_msmarco_n_ngrams_match_1": 29.0625, "eval_msmarco_n_ngrams_match_2": 13.71875, "eval_msmarco_n_ngrams_match_3": 8.140625, "eval_msmarco_num_pred_words": 60.2421875, "eval_msmarco_num_true_words": 62.65625, "eval_msmarco_perplexity": 5.8439745473671945, "eval_msmarco_pred_num_tokens": 81.15625, "eval_msmarco_rouge_score": 0.4637009110989646, "eval_msmarco_runtime": 27.1089, "eval_msmarco_samples_per_second": 18.444, "eval_msmarco_steps_per_second": 0.037, "eval_msmarco_token_set_f1": 0.4810139572983357, "eval_msmarco_token_set_f1_sem": 0.015147798402942116, "eval_msmarco_token_set_precision": 0.44502532466432776, "eval_msmarco_token_set_recall": 0.542955515141241, "eval_msmarco_true_num_tokens": 80.5078125, "step": 950000 }, { "epoch": 55.571152833830496, "grad_norm": 0.2631101906299591, "learning_rate": 0.001, "loss": 1.7984, "step": 950100 }, { "epoch": 55.5770018131836, "grad_norm": 0.1399993747472763, "learning_rate": 0.001, "loss": 1.8005, "step": 950200 }, { "epoch": 55.5828507925367, "grad_norm": 0.2458731234073639, "learning_rate": 0.001, "loss": 1.7959, "step": 950300 }, { "epoch": 55.58869977188981, "grad_norm": 0.21549983322620392, "learning_rate": 0.001, "loss": 1.8022, "step": 950400 }, { "epoch": 55.59454875124291, "grad_norm": 0.1597636342048645, "learning_rate": 0.001, "loss": 1.8034, "step": 950500 }, { "epoch": 55.60039773059601, "grad_norm": 0.18429140746593475, "learning_rate": 0.001, "loss": 1.7991, "step": 950600 }, { "epoch": 55.60624670994911, "grad_norm": 0.19281695783138275, "learning_rate": 0.001, "loss": 1.8072, "step": 950700 }, { "epoch": 55.612095689302215, "grad_norm": 0.1510644257068634, "learning_rate": 0.001, "loss": 1.7979, "step": 950800 }, { "epoch": 55.61794466865532, "grad_norm": 0.1470826119184494, "learning_rate": 0.001, "loss": 1.8051, "step": 950900 }, { "epoch": 55.62379364800842, "grad_norm": 0.16210970282554626, "learning_rate": 0.001, "loss": 1.8052, "step": 951000 }, { "epoch": 55.62964262736153, "grad_norm": 0.17254574596881866, "learning_rate": 0.001, "loss": 1.8019, "step": 951100 }, { "epoch": 55.63549160671463, "grad_norm": 0.12042746692895889, "learning_rate": 0.001, "loss": 1.7978, "step": 951200 }, { "epoch": 55.641340586067734, "grad_norm": 0.20106911659240723, "learning_rate": 0.001, "loss": 1.7962, "step": 951300 }, { "epoch": 55.64718956542083, "grad_norm": 0.16815394163131714, "learning_rate": 0.001, "loss": 1.7972, "step": 951400 }, { "epoch": 55.653038544773935, "grad_norm": 0.17008669674396515, "learning_rate": 0.001, "loss": 1.8019, "step": 951500 }, { "epoch": 55.65888752412704, "grad_norm": 0.15946443378925323, "learning_rate": 0.001, "loss": 1.7951, "step": 951600 }, { "epoch": 55.66473650348014, "grad_norm": 0.198915034532547, "learning_rate": 0.001, "loss": 1.7966, "step": 951700 }, { "epoch": 55.670585482833246, "grad_norm": 0.1477261185646057, "learning_rate": 0.001, "loss": 1.7987, "step": 951800 }, { "epoch": 55.67643446218635, "grad_norm": 0.21652305126190186, "learning_rate": 0.001, "loss": 1.7975, "step": 951900 }, { "epoch": 55.682283441539454, "grad_norm": 0.22564861178398132, "learning_rate": 0.001, "loss": 1.8006, "step": 952000 }, { "epoch": 55.68813242089256, "grad_norm": 0.12736451625823975, "learning_rate": 0.001, "loss": 1.8004, "step": 952100 }, { "epoch": 55.693981400245654, "grad_norm": 0.17821979522705078, "learning_rate": 0.001, "loss": 1.7958, "step": 952200 }, { "epoch": 55.69983037959876, "grad_norm": 0.13858206570148468, "learning_rate": 0.001, "loss": 1.7988, "step": 952300 }, { "epoch": 55.70567935895186, "grad_norm": 0.17056609690189362, "learning_rate": 0.001, "loss": 1.7971, "step": 952400 }, { "epoch": 55.711528338304966, "grad_norm": 0.1976151317358017, "learning_rate": 0.001, "loss": 1.7976, "step": 952500 }, { "epoch": 55.71737731765807, "grad_norm": 0.1715439409017563, "learning_rate": 0.001, "loss": 1.7997, "step": 952600 }, { "epoch": 55.72322629701117, "grad_norm": 0.1692531555891037, "learning_rate": 0.001, "loss": 1.8033, "step": 952700 }, { "epoch": 55.72907527636428, "grad_norm": 0.2514359951019287, "learning_rate": 0.001, "loss": 1.7954, "step": 952800 }, { "epoch": 55.734924255717374, "grad_norm": 0.1476997435092926, "learning_rate": 0.001, "loss": 1.7987, "step": 952900 }, { "epoch": 55.74077323507048, "grad_norm": 0.18567171692848206, "learning_rate": 0.001, "loss": 1.8016, "step": 953000 }, { "epoch": 55.74662221442358, "grad_norm": 0.17577312886714935, "learning_rate": 0.001, "loss": 1.8069, "step": 953100 }, { "epoch": 55.752471193776685, "grad_norm": 0.2191869467496872, "learning_rate": 0.001, "loss": 1.7955, "step": 953200 }, { "epoch": 55.75832017312979, "grad_norm": 0.14486238360404968, "learning_rate": 0.001, "loss": 1.8018, "step": 953300 }, { "epoch": 55.76416915248289, "grad_norm": 0.16333670914173126, "learning_rate": 0.001, "loss": 1.8018, "step": 953400 }, { "epoch": 55.770018131836, "grad_norm": 0.20151737332344055, "learning_rate": 0.001, "loss": 1.8029, "step": 953500 }, { "epoch": 55.7758671111891, "grad_norm": 0.23138757050037384, "learning_rate": 0.001, "loss": 1.7972, "step": 953600 }, { "epoch": 55.7817160905422, "grad_norm": 0.15080669522285461, "learning_rate": 0.001, "loss": 1.8044, "step": 953700 }, { "epoch": 55.7875650698953, "grad_norm": 0.15957596898078918, "learning_rate": 0.001, "loss": 1.7952, "step": 953800 }, { "epoch": 55.793414049248405, "grad_norm": 0.13961933553218842, "learning_rate": 0.001, "loss": 1.8013, "step": 953900 }, { "epoch": 55.79926302860151, "grad_norm": 0.17235037684440613, "learning_rate": 0.001, "loss": 1.8003, "step": 954000 }, { "epoch": 55.80511200795461, "grad_norm": 0.18258340656757355, "learning_rate": 0.001, "loss": 1.796, "step": 954100 }, { "epoch": 55.810960987307716, "grad_norm": 0.1939040869474411, "learning_rate": 0.001, "loss": 1.8064, "step": 954200 }, { "epoch": 55.81680996666082, "grad_norm": 0.1271142214536667, "learning_rate": 0.001, "loss": 1.8031, "step": 954300 }, { "epoch": 55.822658946013924, "grad_norm": 0.17654243111610413, "learning_rate": 0.001, "loss": 1.8036, "step": 954400 }, { "epoch": 55.82850792536702, "grad_norm": 0.24292464554309845, "learning_rate": 0.001, "loss": 1.805, "step": 954500 }, { "epoch": 55.834356904720124, "grad_norm": 0.23255331814289093, "learning_rate": 0.001, "loss": 1.8006, "step": 954600 }, { "epoch": 55.84020588407323, "grad_norm": 0.15771357715129852, "learning_rate": 0.001, "loss": 1.7971, "step": 954700 }, { "epoch": 55.84605486342633, "grad_norm": 0.18995195627212524, "learning_rate": 0.001, "loss": 1.8035, "step": 954800 }, { "epoch": 55.851903842779436, "grad_norm": 0.140130877494812, "learning_rate": 0.001, "loss": 1.802, "step": 954900 }, { "epoch": 55.85775282213254, "grad_norm": 0.2459365725517273, "learning_rate": 0.001, "loss": 1.7995, "step": 955000 }, { "epoch": 55.86360180148564, "grad_norm": 0.18191742897033691, "learning_rate": 0.001, "loss": 1.802, "step": 955100 }, { "epoch": 55.86945078083875, "grad_norm": 0.24377579987049103, "learning_rate": 0.001, "loss": 1.8076, "step": 955200 }, { "epoch": 55.875299760191844, "grad_norm": 0.19012051820755005, "learning_rate": 0.001, "loss": 1.7989, "step": 955300 }, { "epoch": 55.88114873954495, "grad_norm": 0.34474101662635803, "learning_rate": 0.001, "loss": 1.8054, "step": 955400 }, { "epoch": 55.88699771889805, "grad_norm": 0.28447794914245605, "learning_rate": 0.001, "loss": 1.803, "step": 955500 }, { "epoch": 55.892846698251155, "grad_norm": 0.24453844130039215, "learning_rate": 0.001, "loss": 1.8077, "step": 955600 }, { "epoch": 55.89869567760426, "grad_norm": 0.22624926269054413, "learning_rate": 0.001, "loss": 1.801, "step": 955700 }, { "epoch": 55.90454465695736, "grad_norm": 0.16559170186519623, "learning_rate": 0.001, "loss": 1.8025, "step": 955800 }, { "epoch": 55.91039363631047, "grad_norm": 0.13075321912765503, "learning_rate": 0.001, "loss": 1.8005, "step": 955900 }, { "epoch": 55.91624261566356, "grad_norm": 0.18869516253471375, "learning_rate": 0.001, "loss": 1.7976, "step": 956000 }, { "epoch": 55.92209159501667, "grad_norm": 0.18220014870166779, "learning_rate": 0.001, "loss": 1.7998, "step": 956100 }, { "epoch": 55.92794057436977, "grad_norm": 0.16071262955665588, "learning_rate": 0.001, "loss": 1.807, "step": 956200 }, { "epoch": 55.933789553722875, "grad_norm": 0.17203351855278015, "learning_rate": 0.001, "loss": 1.8002, "step": 956300 }, { "epoch": 55.93963853307598, "grad_norm": 0.20367009937763214, "learning_rate": 0.001, "loss": 1.8021, "step": 956400 }, { "epoch": 55.94548751242908, "grad_norm": 0.264997661113739, "learning_rate": 0.001, "loss": 1.8005, "step": 956500 }, { "epoch": 55.951336491782186, "grad_norm": 0.16039946675300598, "learning_rate": 0.001, "loss": 1.805, "step": 956600 }, { "epoch": 55.95718547113529, "grad_norm": 0.20552058517932892, "learning_rate": 0.001, "loss": 1.8108, "step": 956700 }, { "epoch": 55.96303445048839, "grad_norm": 0.2843647003173828, "learning_rate": 0.001, "loss": 1.8047, "step": 956800 }, { "epoch": 55.96888342984149, "grad_norm": 0.20352232456207275, "learning_rate": 0.001, "loss": 1.8096, "step": 956900 }, { "epoch": 55.974732409194594, "grad_norm": 0.1571057140827179, "learning_rate": 0.001, "loss": 1.8044, "step": 957000 }, { "epoch": 55.9805813885477, "grad_norm": 0.19952411949634552, "learning_rate": 0.001, "loss": 1.7978, "step": 957100 }, { "epoch": 55.9864303679008, "grad_norm": 0.19047978520393372, "learning_rate": 0.001, "loss": 1.8019, "step": 957200 }, { "epoch": 55.992279347253906, "grad_norm": 0.16777403652668, "learning_rate": 0.001, "loss": 1.8037, "step": 957300 }, { "epoch": 55.99812832660701, "grad_norm": 0.20044493675231934, "learning_rate": 0.001, "loss": 1.8022, "step": 957400 }, { "epoch": 56.00397730596011, "grad_norm": 0.19947592914104462, "learning_rate": 0.001, "loss": 1.7968, "step": 957500 }, { "epoch": 56.00982628531321, "grad_norm": 0.18017476797103882, "learning_rate": 0.001, "loss": 1.7879, "step": 957600 }, { "epoch": 56.015675264666314, "grad_norm": 0.20649488270282745, "learning_rate": 0.001, "loss": 1.7867, "step": 957700 }, { "epoch": 56.02152424401942, "grad_norm": 0.2040528804063797, "learning_rate": 0.001, "loss": 1.7841, "step": 957800 }, { "epoch": 56.02737322337252, "grad_norm": 0.16952699422836304, "learning_rate": 0.001, "loss": 1.7888, "step": 957900 }, { "epoch": 56.033222202725625, "grad_norm": 0.1853371113538742, "learning_rate": 0.001, "loss": 1.783, "step": 958000 }, { "epoch": 56.03907118207873, "grad_norm": 0.2342005968093872, "learning_rate": 0.001, "loss": 1.7901, "step": 958100 }, { "epoch": 56.04492016143183, "grad_norm": 0.16513654589653015, "learning_rate": 0.001, "loss": 1.7923, "step": 958200 }, { "epoch": 56.050769140784936, "grad_norm": 0.1767067313194275, "learning_rate": 0.001, "loss": 1.7919, "step": 958300 }, { "epoch": 56.05661812013803, "grad_norm": 0.17325246334075928, "learning_rate": 0.001, "loss": 1.7928, "step": 958400 }, { "epoch": 56.06246709949114, "grad_norm": 0.21191827952861786, "learning_rate": 0.001, "loss": 1.7882, "step": 958500 }, { "epoch": 56.06831607884424, "grad_norm": 0.17483378946781158, "learning_rate": 0.001, "loss": 1.7912, "step": 958600 }, { "epoch": 56.074165058197345, "grad_norm": 0.16226908564567566, "learning_rate": 0.001, "loss": 1.7879, "step": 958700 }, { "epoch": 56.08001403755045, "grad_norm": 0.15594182908535004, "learning_rate": 0.001, "loss": 1.7902, "step": 958800 }, { "epoch": 56.08586301690355, "grad_norm": 0.20536759495735168, "learning_rate": 0.001, "loss": 1.7886, "step": 958900 }, { "epoch": 56.091711996256656, "grad_norm": 0.23726467788219452, "learning_rate": 0.001, "loss": 1.7913, "step": 959000 }, { "epoch": 56.09756097560975, "grad_norm": 0.1596798300743103, "learning_rate": 0.001, "loss": 1.7961, "step": 959100 }, { "epoch": 56.103409954962856, "grad_norm": 0.19718018174171448, "learning_rate": 0.001, "loss": 1.7889, "step": 959200 }, { "epoch": 56.10925893431596, "grad_norm": 0.23304615914821625, "learning_rate": 0.001, "loss": 1.7931, "step": 959300 }, { "epoch": 56.115107913669064, "grad_norm": 0.17511901259422302, "learning_rate": 0.001, "loss": 1.7929, "step": 959400 }, { "epoch": 56.12095689302217, "grad_norm": 0.16385842859745026, "learning_rate": 0.001, "loss": 1.7906, "step": 959500 }, { "epoch": 56.12680587237527, "grad_norm": 0.1997421383857727, "learning_rate": 0.001, "loss": 1.7912, "step": 959600 }, { "epoch": 56.132654851728375, "grad_norm": 0.20481278002262115, "learning_rate": 0.001, "loss": 1.7971, "step": 959700 }, { "epoch": 56.13850383108148, "grad_norm": 0.18852709233760834, "learning_rate": 0.001, "loss": 1.7905, "step": 959800 }, { "epoch": 56.144352810434576, "grad_norm": 0.1937093287706375, "learning_rate": 0.001, "loss": 1.7892, "step": 959900 }, { "epoch": 56.15020178978768, "grad_norm": 0.17643797397613525, "learning_rate": 0.001, "loss": 1.7955, "step": 960000 }, { "epoch": 56.15605076914078, "grad_norm": 0.15045872330665588, "learning_rate": 0.001, "loss": 1.7943, "step": 960100 }, { "epoch": 56.16189974849389, "grad_norm": 0.1688019186258316, "learning_rate": 0.001, "loss": 1.7916, "step": 960200 }, { "epoch": 56.16774872784699, "grad_norm": 0.14971569180488586, "learning_rate": 0.001, "loss": 1.7907, "step": 960300 }, { "epoch": 56.173597707200095, "grad_norm": 0.16500850021839142, "learning_rate": 0.001, "loss": 1.7908, "step": 960400 }, { "epoch": 56.1794466865532, "grad_norm": 0.16297321021556854, "learning_rate": 0.001, "loss": 1.7933, "step": 960500 }, { "epoch": 56.1852956659063, "grad_norm": 0.18440283834934235, "learning_rate": 0.001, "loss": 1.7934, "step": 960600 }, { "epoch": 56.1911446452594, "grad_norm": 0.16385769844055176, "learning_rate": 0.001, "loss": 1.7924, "step": 960700 }, { "epoch": 56.1969936246125, "grad_norm": 0.22387069463729858, "learning_rate": 0.001, "loss": 1.7964, "step": 960800 }, { "epoch": 56.20284260396561, "grad_norm": 0.21447189152240753, "learning_rate": 0.001, "loss": 1.7988, "step": 960900 }, { "epoch": 56.20869158331871, "grad_norm": 0.19171147048473358, "learning_rate": 0.001, "loss": 1.7965, "step": 961000 }, { "epoch": 56.214540562671814, "grad_norm": 0.17282940447330475, "learning_rate": 0.001, "loss": 1.7888, "step": 961100 }, { "epoch": 56.22038954202492, "grad_norm": 0.16464638710021973, "learning_rate": 0.001, "loss": 1.7999, "step": 961200 }, { "epoch": 56.22623852137802, "grad_norm": 0.15557971596717834, "learning_rate": 0.001, "loss": 1.794, "step": 961300 }, { "epoch": 56.232087500731126, "grad_norm": 0.18526601791381836, "learning_rate": 0.001, "loss": 1.795, "step": 961400 }, { "epoch": 56.23793648008422, "grad_norm": 0.16615672409534454, "learning_rate": 0.001, "loss": 1.7885, "step": 961500 }, { "epoch": 56.243785459437326, "grad_norm": 0.18114592134952545, "learning_rate": 0.001, "loss": 1.7983, "step": 961600 }, { "epoch": 56.24963443879043, "grad_norm": 0.168924018740654, "learning_rate": 0.001, "loss": 1.7957, "step": 961700 }, { "epoch": 56.255483418143534, "grad_norm": 0.19324924051761627, "learning_rate": 0.001, "loss": 1.795, "step": 961800 }, { "epoch": 56.26133239749664, "grad_norm": 0.16470672190189362, "learning_rate": 0.001, "loss": 1.791, "step": 961900 }, { "epoch": 56.26718137684974, "grad_norm": 0.1941363662481308, "learning_rate": 0.001, "loss": 1.7928, "step": 962000 }, { "epoch": 56.273030356202845, "grad_norm": 0.18640141189098358, "learning_rate": 0.001, "loss": 1.7972, "step": 962100 }, { "epoch": 56.27887933555594, "grad_norm": 0.21050108969211578, "learning_rate": 0.001, "loss": 1.7954, "step": 962200 }, { "epoch": 56.284728314909046, "grad_norm": 0.26405981183052063, "learning_rate": 0.001, "loss": 1.7975, "step": 962300 }, { "epoch": 56.29057729426215, "grad_norm": 0.2068343460559845, "learning_rate": 0.001, "loss": 1.7937, "step": 962400 }, { "epoch": 56.29642627361525, "grad_norm": 0.17177802324295044, "learning_rate": 0.001, "loss": 1.7925, "step": 962500 }, { "epoch": 56.30227525296836, "grad_norm": 0.2304457575082779, "learning_rate": 0.001, "loss": 1.7994, "step": 962600 }, { "epoch": 56.30812423232146, "grad_norm": 0.20699432492256165, "learning_rate": 0.001, "loss": 1.7998, "step": 962700 }, { "epoch": 56.313973211674565, "grad_norm": 0.15672321617603302, "learning_rate": 0.001, "loss": 1.7903, "step": 962800 }, { "epoch": 56.31982219102767, "grad_norm": 0.1994446963071823, "learning_rate": 0.001, "loss": 1.8005, "step": 962900 }, { "epoch": 56.325671170380765, "grad_norm": 0.17800238728523254, "learning_rate": 0.001, "loss": 1.7967, "step": 963000 }, { "epoch": 56.33152014973387, "grad_norm": 0.1715332567691803, "learning_rate": 0.001, "loss": 1.7961, "step": 963100 }, { "epoch": 56.33736912908697, "grad_norm": 0.17327989637851715, "learning_rate": 0.001, "loss": 1.7956, "step": 963200 }, { "epoch": 56.34321810844008, "grad_norm": 0.17618466913700104, "learning_rate": 0.001, "loss": 1.7951, "step": 963300 }, { "epoch": 56.34906708779318, "grad_norm": 0.22618862986564636, "learning_rate": 0.001, "loss": 1.797, "step": 963400 }, { "epoch": 56.354916067146284, "grad_norm": 0.1534169465303421, "learning_rate": 0.001, "loss": 1.7931, "step": 963500 }, { "epoch": 56.36076504649939, "grad_norm": 0.1604505479335785, "learning_rate": 0.001, "loss": 1.7928, "step": 963600 }, { "epoch": 56.36661402585249, "grad_norm": 0.18261098861694336, "learning_rate": 0.001, "loss": 1.7953, "step": 963700 }, { "epoch": 56.37246300520559, "grad_norm": 0.16375397145748138, "learning_rate": 0.001, "loss": 1.7985, "step": 963800 }, { "epoch": 56.37831198455869, "grad_norm": 0.19243782758712769, "learning_rate": 0.001, "loss": 1.7947, "step": 963900 }, { "epoch": 56.384160963911796, "grad_norm": 0.18487350642681122, "learning_rate": 0.001, "loss": 1.795, "step": 964000 }, { "epoch": 56.3900099432649, "grad_norm": 0.20875635743141174, "learning_rate": 0.001, "loss": 1.7992, "step": 964100 }, { "epoch": 56.395858922618004, "grad_norm": 0.18604037165641785, "learning_rate": 0.001, "loss": 1.7959, "step": 964200 }, { "epoch": 56.40170790197111, "grad_norm": 0.16759002208709717, "learning_rate": 0.001, "loss": 1.796, "step": 964300 }, { "epoch": 56.40755688132421, "grad_norm": 0.20493970811367035, "learning_rate": 0.001, "loss": 1.7942, "step": 964400 }, { "epoch": 56.413405860677315, "grad_norm": 0.14860360324382782, "learning_rate": 0.001, "loss": 1.794, "step": 964500 }, { "epoch": 56.41925484003041, "grad_norm": 0.1583453118801117, "learning_rate": 0.001, "loss": 1.7993, "step": 964600 }, { "epoch": 56.425103819383516, "grad_norm": 0.15171782672405243, "learning_rate": 0.001, "loss": 1.7993, "step": 964700 }, { "epoch": 56.43095279873662, "grad_norm": 0.19635112583637238, "learning_rate": 0.001, "loss": 1.8013, "step": 964800 }, { "epoch": 56.43680177808972, "grad_norm": 0.18770058453083038, "learning_rate": 0.001, "loss": 1.799, "step": 964900 }, { "epoch": 56.44265075744283, "grad_norm": 0.16799555718898773, "learning_rate": 0.001, "loss": 1.7991, "step": 965000 }, { "epoch": 56.44849973679593, "grad_norm": 0.20140032470226288, "learning_rate": 0.001, "loss": 1.7902, "step": 965100 }, { "epoch": 56.454348716149035, "grad_norm": 0.17325100302696228, "learning_rate": 0.001, "loss": 1.7948, "step": 965200 }, { "epoch": 56.46019769550213, "grad_norm": 0.18201136589050293, "learning_rate": 0.001, "loss": 1.7941, "step": 965300 }, { "epoch": 56.466046674855235, "grad_norm": 0.1674443930387497, "learning_rate": 0.001, "loss": 1.8009, "step": 965400 }, { "epoch": 56.47189565420834, "grad_norm": 0.1879129558801651, "learning_rate": 0.001, "loss": 1.7924, "step": 965500 }, { "epoch": 56.47774463356144, "grad_norm": 0.2051096260547638, "learning_rate": 0.001, "loss": 1.7909, "step": 965600 }, { "epoch": 56.48359361291455, "grad_norm": 0.18273182213306427, "learning_rate": 0.001, "loss": 1.7916, "step": 965700 }, { "epoch": 56.48944259226765, "grad_norm": 0.18245984613895416, "learning_rate": 0.001, "loss": 1.7957, "step": 965800 }, { "epoch": 56.495291571620754, "grad_norm": 0.16500288248062134, "learning_rate": 0.001, "loss": 1.796, "step": 965900 }, { "epoch": 56.50114055097386, "grad_norm": 0.1730683296918869, "learning_rate": 0.001, "loss": 1.8025, "step": 966000 }, { "epoch": 56.506989530326955, "grad_norm": 0.21134765446186066, "learning_rate": 0.001, "loss": 1.8011, "step": 966100 }, { "epoch": 56.51283850968006, "grad_norm": 0.17080356180667877, "learning_rate": 0.001, "loss": 1.7998, "step": 966200 }, { "epoch": 56.51868748903316, "grad_norm": 0.21924927830696106, "learning_rate": 0.001, "loss": 1.7944, "step": 966300 }, { "epoch": 56.524536468386266, "grad_norm": 0.15720856189727783, "learning_rate": 0.001, "loss": 1.7954, "step": 966400 }, { "epoch": 56.53038544773937, "grad_norm": 0.19488535821437836, "learning_rate": 0.001, "loss": 1.7997, "step": 966500 }, { "epoch": 56.536234427092474, "grad_norm": 0.18004228174686432, "learning_rate": 0.001, "loss": 1.7989, "step": 966600 }, { "epoch": 56.54208340644558, "grad_norm": 0.19074589014053345, "learning_rate": 0.001, "loss": 1.7972, "step": 966700 }, { "epoch": 56.54793238579868, "grad_norm": 0.18022149801254272, "learning_rate": 0.001, "loss": 1.7938, "step": 966800 }, { "epoch": 56.55378136515178, "grad_norm": 0.19497394561767578, "learning_rate": 0.001, "loss": 1.7957, "step": 966900 }, { "epoch": 56.55963034450488, "grad_norm": 0.170572891831398, "learning_rate": 0.001, "loss": 1.7956, "step": 967000 }, { "epoch": 56.565479323857986, "grad_norm": 0.16130706667900085, "learning_rate": 0.001, "loss": 1.8058, "step": 967100 }, { "epoch": 56.57132830321109, "grad_norm": 0.19987952709197998, "learning_rate": 0.001, "loss": 1.7954, "step": 967200 }, { "epoch": 56.57717728256419, "grad_norm": 0.2404319792985916, "learning_rate": 0.001, "loss": 1.7999, "step": 967300 }, { "epoch": 56.5830262619173, "grad_norm": 0.17827171087265015, "learning_rate": 0.001, "loss": 1.7934, "step": 967400 }, { "epoch": 56.5888752412704, "grad_norm": 0.1629689633846283, "learning_rate": 0.001, "loss": 1.7962, "step": 967500 }, { "epoch": 56.594724220623505, "grad_norm": 0.18275956809520721, "learning_rate": 0.001, "loss": 1.7945, "step": 967600 }, { "epoch": 56.6005731999766, "grad_norm": 0.2039315402507782, "learning_rate": 0.001, "loss": 1.7942, "step": 967700 }, { "epoch": 56.606422179329705, "grad_norm": 0.18712027370929718, "learning_rate": 0.001, "loss": 1.7975, "step": 967800 }, { "epoch": 56.61227115868281, "grad_norm": 0.15893587470054626, "learning_rate": 0.001, "loss": 1.797, "step": 967900 }, { "epoch": 56.61812013803591, "grad_norm": 0.15346547961235046, "learning_rate": 0.001, "loss": 1.7984, "step": 968000 }, { "epoch": 56.62396911738902, "grad_norm": 0.19058124721050262, "learning_rate": 0.001, "loss": 1.7929, "step": 968100 }, { "epoch": 56.62981809674212, "grad_norm": 0.2214646190404892, "learning_rate": 0.001, "loss": 1.7998, "step": 968200 }, { "epoch": 56.635667076095224, "grad_norm": 0.2047048956155777, "learning_rate": 0.001, "loss": 1.8005, "step": 968300 }, { "epoch": 56.64151605544832, "grad_norm": 0.1954025775194168, "learning_rate": 0.001, "loss": 1.8024, "step": 968400 }, { "epoch": 56.647365034801425, "grad_norm": 0.1651843786239624, "learning_rate": 0.001, "loss": 1.8038, "step": 968500 }, { "epoch": 56.65321401415453, "grad_norm": 0.23257103562355042, "learning_rate": 0.001, "loss": 1.8027, "step": 968600 }, { "epoch": 56.65906299350763, "grad_norm": 0.2271033674478531, "learning_rate": 0.001, "loss": 1.7969, "step": 968700 }, { "epoch": 56.664911972860736, "grad_norm": 0.18666796386241913, "learning_rate": 0.001, "loss": 1.7991, "step": 968800 }, { "epoch": 56.67076095221384, "grad_norm": 0.15271654725074768, "learning_rate": 0.001, "loss": 1.7976, "step": 968900 }, { "epoch": 56.676609931566944, "grad_norm": 0.18704798817634583, "learning_rate": 0.001, "loss": 1.8026, "step": 969000 }, { "epoch": 56.68245891092005, "grad_norm": 0.1810653805732727, "learning_rate": 0.001, "loss": 1.8023, "step": 969100 }, { "epoch": 56.688307890273144, "grad_norm": 0.14951981604099274, "learning_rate": 0.001, "loss": 1.7955, "step": 969200 }, { "epoch": 56.69415686962625, "grad_norm": 0.18999208509922028, "learning_rate": 0.001, "loss": 1.7934, "step": 969300 }, { "epoch": 56.70000584897935, "grad_norm": 0.14959849417209625, "learning_rate": 0.001, "loss": 1.7977, "step": 969400 }, { "epoch": 56.705854828332455, "grad_norm": 0.1728116124868393, "learning_rate": 0.001, "loss": 1.8048, "step": 969500 }, { "epoch": 56.71170380768556, "grad_norm": 0.18561607599258423, "learning_rate": 0.001, "loss": 1.7958, "step": 969600 }, { "epoch": 56.71755278703866, "grad_norm": 0.18376941978931427, "learning_rate": 0.001, "loss": 1.7996, "step": 969700 }, { "epoch": 56.72340176639177, "grad_norm": 0.14435036480426788, "learning_rate": 0.001, "loss": 1.8039, "step": 969800 }, { "epoch": 56.72925074574487, "grad_norm": 0.2200029194355011, "learning_rate": 0.001, "loss": 1.7934, "step": 969900 }, { "epoch": 56.73509972509797, "grad_norm": 0.18035002052783966, "learning_rate": 0.001, "loss": 1.7981, "step": 970000 }, { "epoch": 56.74094870445107, "grad_norm": 0.24735943973064423, "learning_rate": 0.001, "loss": 1.8011, "step": 970100 }, { "epoch": 56.746797683804175, "grad_norm": 0.17801396548748016, "learning_rate": 0.001, "loss": 1.8052, "step": 970200 }, { "epoch": 56.75264666315728, "grad_norm": 0.16073541343212128, "learning_rate": 0.001, "loss": 1.7972, "step": 970300 }, { "epoch": 56.75849564251038, "grad_norm": 0.18140895664691925, "learning_rate": 0.001, "loss": 1.7994, "step": 970400 }, { "epoch": 56.764344621863486, "grad_norm": 0.2899450361728668, "learning_rate": 0.001, "loss": 1.8011, "step": 970500 }, { "epoch": 56.77019360121659, "grad_norm": 0.1546018272638321, "learning_rate": 0.001, "loss": 1.8005, "step": 970600 }, { "epoch": 56.776042580569694, "grad_norm": 0.158040851354599, "learning_rate": 0.001, "loss": 1.7938, "step": 970700 }, { "epoch": 56.78189155992279, "grad_norm": 0.16221386194229126, "learning_rate": 0.001, "loss": 1.8034, "step": 970800 }, { "epoch": 56.787740539275894, "grad_norm": 0.24630743265151978, "learning_rate": 0.001, "loss": 1.804, "step": 970900 }, { "epoch": 56.793589518629, "grad_norm": 0.14387871325016022, "learning_rate": 0.001, "loss": 1.7981, "step": 971000 }, { "epoch": 56.7994384979821, "grad_norm": 0.1532963067293167, "learning_rate": 0.001, "loss": 1.7977, "step": 971100 }, { "epoch": 56.805287477335206, "grad_norm": 0.23698486387729645, "learning_rate": 0.001, "loss": 1.7964, "step": 971200 }, { "epoch": 56.81113645668831, "grad_norm": 0.14881539344787598, "learning_rate": 0.001, "loss": 1.8002, "step": 971300 }, { "epoch": 56.81698543604141, "grad_norm": 0.16701294481754303, "learning_rate": 0.001, "loss": 1.7994, "step": 971400 }, { "epoch": 56.82283441539451, "grad_norm": 0.15429793298244476, "learning_rate": 0.001, "loss": 1.8009, "step": 971500 }, { "epoch": 56.828683394747614, "grad_norm": 0.1909763664007187, "learning_rate": 0.001, "loss": 1.8035, "step": 971600 }, { "epoch": 56.83453237410072, "grad_norm": 0.2568204700946808, "learning_rate": 0.001, "loss": 1.8071, "step": 971700 }, { "epoch": 56.84038135345382, "grad_norm": 0.21645815670490265, "learning_rate": 0.001, "loss": 1.8046, "step": 971800 }, { "epoch": 56.846230332806925, "grad_norm": 0.21314162015914917, "learning_rate": 0.001, "loss": 1.802, "step": 971900 }, { "epoch": 56.85207931216003, "grad_norm": 0.16168750822544098, "learning_rate": 0.001, "loss": 1.8039, "step": 972000 }, { "epoch": 56.85792829151313, "grad_norm": 0.17694319784641266, "learning_rate": 0.001, "loss": 1.7986, "step": 972100 }, { "epoch": 56.86377727086624, "grad_norm": 0.17585502564907074, "learning_rate": 0.001, "loss": 1.7981, "step": 972200 }, { "epoch": 56.86962625021933, "grad_norm": 0.17237207293510437, "learning_rate": 0.001, "loss": 1.8031, "step": 972300 }, { "epoch": 56.87547522957244, "grad_norm": 0.18537333607673645, "learning_rate": 0.001, "loss": 1.8002, "step": 972400 }, { "epoch": 56.88132420892554, "grad_norm": 0.16837406158447266, "learning_rate": 0.001, "loss": 1.7924, "step": 972500 }, { "epoch": 56.887173188278645, "grad_norm": 0.18739053606987, "learning_rate": 0.001, "loss": 1.8018, "step": 972600 }, { "epoch": 56.89302216763175, "grad_norm": 0.19422568380832672, "learning_rate": 0.001, "loss": 1.8005, "step": 972700 }, { "epoch": 56.89887114698485, "grad_norm": 0.15938077867031097, "learning_rate": 0.001, "loss": 1.7976, "step": 972800 }, { "epoch": 56.904720126337956, "grad_norm": 0.1763966977596283, "learning_rate": 0.001, "loss": 1.8089, "step": 972900 }, { "epoch": 56.91056910569106, "grad_norm": 0.21018733084201813, "learning_rate": 0.001, "loss": 1.8036, "step": 973000 }, { "epoch": 56.91641808504416, "grad_norm": 0.16118821501731873, "learning_rate": 0.001, "loss": 1.8048, "step": 973100 }, { "epoch": 56.92226706439726, "grad_norm": 0.20432627201080322, "learning_rate": 0.001, "loss": 1.8013, "step": 973200 }, { "epoch": 56.928116043750364, "grad_norm": 0.16340318322181702, "learning_rate": 0.001, "loss": 1.7997, "step": 973300 }, { "epoch": 56.93396502310347, "grad_norm": 0.20526498556137085, "learning_rate": 0.001, "loss": 1.8099, "step": 973400 }, { "epoch": 56.93981400245657, "grad_norm": 0.18010644614696503, "learning_rate": 0.001, "loss": 1.8002, "step": 973500 }, { "epoch": 56.945662981809676, "grad_norm": 0.22801260650157928, "learning_rate": 0.001, "loss": 1.7984, "step": 973600 }, { "epoch": 56.95151196116278, "grad_norm": 0.17900586128234863, "learning_rate": 0.001, "loss": 1.7995, "step": 973700 }, { "epoch": 56.95736094051588, "grad_norm": 0.2150951474905014, "learning_rate": 0.001, "loss": 1.7989, "step": 973800 }, { "epoch": 56.96320991986898, "grad_norm": 0.19950605928897858, "learning_rate": 0.001, "loss": 1.8042, "step": 973900 }, { "epoch": 56.969058899222084, "grad_norm": 0.18596410751342773, "learning_rate": 0.001, "loss": 1.8017, "step": 974000 }, { "epoch": 56.97490787857519, "grad_norm": 0.2011069655418396, "learning_rate": 0.001, "loss": 1.8004, "step": 974100 }, { "epoch": 56.98075685792829, "grad_norm": 0.20858462154865265, "learning_rate": 0.001, "loss": 1.8049, "step": 974200 }, { "epoch": 56.986605837281395, "grad_norm": 0.1708568036556244, "learning_rate": 0.001, "loss": 1.8003, "step": 974300 }, { "epoch": 56.9924548166345, "grad_norm": 0.21535462141036987, "learning_rate": 0.001, "loss": 1.8058, "step": 974400 }, { "epoch": 56.9983037959876, "grad_norm": 0.20342150330543518, "learning_rate": 0.001, "loss": 1.8031, "step": 974500 }, { "epoch": 57.0041527753407, "grad_norm": 0.19501109421253204, "learning_rate": 0.001, "loss": 1.7837, "step": 974600 }, { "epoch": 57.0100017546938, "grad_norm": 0.18316513299942017, "learning_rate": 0.001, "loss": 1.7871, "step": 974700 }, { "epoch": 57.01585073404691, "grad_norm": 0.1874546855688095, "learning_rate": 0.001, "loss": 1.7836, "step": 974800 }, { "epoch": 57.02169971340001, "grad_norm": 0.1851603090763092, "learning_rate": 0.001, "loss": 1.7864, "step": 974900 }, { "epoch": 57.027548692753115, "grad_norm": 0.1694696843624115, "learning_rate": 0.001, "loss": 1.7857, "step": 975000 }, { "epoch": 57.027548692753115, "eval_ag_news_accuracy": 0.247140625, "eval_ag_news_bleu_score": 7.4836815810229576, "eval_ag_news_bleu_score_sem": 0.5132611863698439, "eval_ag_news_emb_cos_sim": 0.7033215165138245, "eval_ag_news_emb_cos_sim_sem": 0.015558334067463875, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.672947645187378, "eval_ag_news_n_ngrams_match_1": 15.1015625, "eval_ag_news_n_ngrams_match_2": 4.609375, "eval_ag_news_n_ngrams_match_3": 1.65625, "eval_ag_news_num_pred_words": 46.71875, "eval_ag_news_num_true_words": 45.6484375, "eval_ag_news_perplexity": 14.48259589182815, "eval_ag_news_pred_num_tokens": 70.3515625, "eval_ag_news_rouge_score": 0.31975764349285907, "eval_ag_news_runtime": 36.6807, "eval_ag_news_samples_per_second": 13.631, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.34315716934203283, "eval_ag_news_token_set_f1_sem": 0.010156361493476174, "eval_ag_news_token_set_precision": 0.32814136865244314, "eval_ag_news_token_set_recall": 0.3660703295008939, "eval_ag_news_true_num_tokens": 63.3828125, "step": 975000 }, { "epoch": 57.027548692753115, "eval_anthropic_toxic_prompts_accuracy": 0.103953125, "eval_anthropic_toxic_prompts_bleu_score": 42.11633477611426, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.575308443303161, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8807976841926575, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010980227962136269, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.984375, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.011004959233105183, "eval_anthropic_toxic_prompts_exact_match": 0.1328125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.030114393430435732, "eval_anthropic_toxic_prompts_loss": 1.2673100233078003, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.46875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.0234375, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.921875, "eval_anthropic_toxic_prompts_num_pred_words": 15.578125, "eval_anthropic_toxic_prompts_num_true_words": 14.796875, "eval_anthropic_toxic_prompts_perplexity": 3.5512868235337813, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.5, "eval_anthropic_toxic_prompts_rouge_score": 0.6767547751616757, "eval_anthropic_toxic_prompts_runtime": 30.0097, "eval_anthropic_toxic_prompts_samples_per_second": 16.661, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.6947985781320709, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018790534038037196, "eval_anthropic_toxic_prompts_token_set_precision": 0.7015366976399358, "eval_anthropic_toxic_prompts_token_set_recall": 0.6969436031868115, "eval_anthropic_toxic_prompts_true_num_tokens": 18.140625, "step": 975000 }, { "epoch": 57.027548692753115, "eval_arxiv_accuracy": 0.374484375, "eval_arxiv_bleu_score": 1.5047383816198754, "eval_arxiv_bleu_score_sem": 0.1614799539395378, "eval_arxiv_emb_cos_sim": 0.4245882034301758, "eval_arxiv_emb_cos_sim_sem": 0.01698790118098259, "eval_arxiv_emb_top1_equal": 0.875, "eval_arxiv_emb_top1_equal_sem": 0.029346559196710587, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4157650470733643, "eval_arxiv_n_ngrams_match_1": 12.8671875, "eval_arxiv_n_ngrams_match_2": 2.0703125, "eval_arxiv_n_ngrams_match_3": 0.3046875, "eval_arxiv_num_pred_words": 51.4765625, "eval_arxiv_num_true_words": 86.0703125, "eval_arxiv_perplexity": 30.440228729450865, "eval_arxiv_pred_num_tokens": 126.4296875, "eval_arxiv_rouge_score": 0.1684907855615282, "eval_arxiv_runtime": 31.1001, "eval_arxiv_samples_per_second": 16.077, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.17448907799776786, "eval_arxiv_token_set_f1_sem": 0.00799971434246528, "eval_arxiv_token_set_precision": 0.11900279294888867, "eval_arxiv_token_set_recall": 0.43765529670167097, "eval_arxiv_true_num_tokens": 124.7109375, "step": 975000 }, { "epoch": 57.027548692753115, "eval_python_code_alpaca_accuracy": 0.13190625, "eval_python_code_alpaca_bleu_score": 28.065769126063845, "eval_python_code_alpaca_bleu_score_sem": 1.5074461966250066, "eval_python_code_alpaca_emb_cos_sim": 0.8732285499572754, "eval_python_code_alpaca_emb_cos_sim_sem": 0.00993634108453989, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.5173485279083252, "eval_python_code_alpaca_n_ngrams_match_1": 11.203125, "eval_python_code_alpaca_n_ngrams_match_2": 6.1640625, "eval_python_code_alpaca_n_ngrams_match_3": 3.4453125, "eval_python_code_alpaca_num_pred_words": 18.6015625, "eval_python_code_alpaca_num_true_words": 19.484375, "eval_python_code_alpaca_perplexity": 4.560118125512036, "eval_python_code_alpaca_pred_num_tokens": 24.6015625, "eval_python_code_alpaca_rouge_score": 0.6127189855986032, "eval_python_code_alpaca_runtime": 30.4388, "eval_python_code_alpaca_samples_per_second": 16.426, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6207594929668425, "eval_python_code_alpaca_token_set_f1_sem": 0.013120547150987528, "eval_python_code_alpaca_token_set_precision": 0.6118576893185412, "eval_python_code_alpaca_token_set_recall": 0.6348819908634507, "eval_python_code_alpaca_true_num_tokens": 24.875, "step": 975000 }, { "epoch": 57.027548692753115, "eval_wikibio_accuracy": 0.3709375, "eval_wikibio_bleu_score": 8.152007275881175, "eval_wikibio_bleu_score_sem": 0.7083862212970048, "eval_wikibio_emb_cos_sim": 0.6047844290733337, "eval_wikibio_emb_cos_sim_sem": 0.02236337400972843, "eval_wikibio_emb_top1_equal": 0.9140625, "eval_wikibio_emb_top1_equal_sem": 0.024870097637176514, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.705235004425049, "eval_wikibio_n_ngrams_match_1": 15.8828125, "eval_wikibio_n_ngrams_match_2": 5.765625, "eval_wikibio_n_ngrams_match_3": 2.578125, "eval_wikibio_num_pred_words": 52.0703125, "eval_wikibio_num_true_words": 51.984375, "eval_wikibio_perplexity": 14.957831434095876, "eval_wikibio_pred_num_tokens": 106.109375, "eval_wikibio_rouge_score": 0.3137655826093305, "eval_wikibio_runtime": 31.4427, "eval_wikibio_samples_per_second": 15.902, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.3339425042171436, "eval_wikibio_token_set_f1_sem": 0.012031351483131277, "eval_wikibio_token_set_precision": 0.30172265919524915, "eval_wikibio_token_set_recall": 0.4032832412463141, "eval_wikibio_true_num_tokens": 98.3515625, "step": 975000 }, { "epoch": 57.027548692753115, "eval_msmarco_accuracy": 0.395515625, "eval_msmarco_bleu_score": 17.91016679218144, "eval_msmarco_bleu_score_sem": 1.3878831385715291, "eval_msmarco_emb_cos_sim": 0.7731181383132935, "eval_msmarco_emb_cos_sim_sem": 0.01636776514351368, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.721114993095398, "eval_msmarco_n_ngrams_match_1": 29.4609375, "eval_msmarco_n_ngrams_match_2": 13.0390625, "eval_msmarco_n_ngrams_match_3": 7.625, "eval_msmarco_num_pred_words": 63.8671875, "eval_msmarco_num_true_words": 65.1015625, "eval_msmarco_perplexity": 5.590758647615026, "eval_msmarco_pred_num_tokens": 88.0390625, "eval_msmarco_rouge_score": 0.4455985652280755, "eval_msmarco_runtime": 26.7455, "eval_msmarco_samples_per_second": 18.695, "eval_msmarco_steps_per_second": 0.037, "eval_msmarco_token_set_f1": 0.4621164659748341, "eval_msmarco_token_set_f1_sem": 0.013595204823003283, "eval_msmarco_token_set_precision": 0.42735876520956506, "eval_msmarco_token_set_recall": 0.5187952777266934, "eval_msmarco_true_num_tokens": 85.0703125, "step": 975000 }, { "epoch": 57.03339767210622, "grad_norm": 0.19706493616104126, "learning_rate": 0.001, "loss": 1.7854, "step": 975100 }, { "epoch": 57.03924665145932, "grad_norm": 0.20074304938316345, "learning_rate": 0.001, "loss": 1.7837, "step": 975200 }, { "epoch": 57.045095630812426, "grad_norm": 0.16046172380447388, "learning_rate": 0.001, "loss": 1.7893, "step": 975300 }, { "epoch": 57.05094461016552, "grad_norm": 0.18979598581790924, "learning_rate": 0.001, "loss": 1.7843, "step": 975400 }, { "epoch": 57.05679358951863, "grad_norm": 0.18164069950580597, "learning_rate": 0.001, "loss": 1.7867, "step": 975500 }, { "epoch": 57.06264256887173, "grad_norm": 0.195576012134552, "learning_rate": 0.001, "loss": 1.7928, "step": 975600 }, { "epoch": 57.068491548224834, "grad_norm": 0.1637430340051651, "learning_rate": 0.001, "loss": 1.7902, "step": 975700 }, { "epoch": 57.07434052757794, "grad_norm": 0.1524847447872162, "learning_rate": 0.001, "loss": 1.7911, "step": 975800 }, { "epoch": 57.08018950693104, "grad_norm": 0.20567330718040466, "learning_rate": 0.001, "loss": 1.789, "step": 975900 }, { "epoch": 57.086038486284146, "grad_norm": 0.22773289680480957, "learning_rate": 0.001, "loss": 1.7948, "step": 976000 }, { "epoch": 57.09188746563725, "grad_norm": 0.15385276079177856, "learning_rate": 0.001, "loss": 1.7937, "step": 976100 }, { "epoch": 57.097736444990346, "grad_norm": 0.25700104236602783, "learning_rate": 0.001, "loss": 1.7875, "step": 976200 }, { "epoch": 57.10358542434345, "grad_norm": 0.19344455003738403, "learning_rate": 0.001, "loss": 1.787, "step": 976300 }, { "epoch": 57.109434403696554, "grad_norm": 0.20690655708312988, "learning_rate": 0.001, "loss": 1.7891, "step": 976400 }, { "epoch": 57.11528338304966, "grad_norm": 0.17974239587783813, "learning_rate": 0.001, "loss": 1.7897, "step": 976500 }, { "epoch": 57.12113236240276, "grad_norm": 0.23682895302772522, "learning_rate": 0.001, "loss": 1.7907, "step": 976600 }, { "epoch": 57.126981341755865, "grad_norm": 0.23195648193359375, "learning_rate": 0.001, "loss": 1.7897, "step": 976700 }, { "epoch": 57.13283032110897, "grad_norm": 0.17489758133888245, "learning_rate": 0.001, "loss": 1.7897, "step": 976800 }, { "epoch": 57.13867930046207, "grad_norm": 0.18169885873794556, "learning_rate": 0.001, "loss": 1.7932, "step": 976900 }, { "epoch": 57.14452827981517, "grad_norm": 0.20694398880004883, "learning_rate": 0.001, "loss": 1.7891, "step": 977000 }, { "epoch": 57.15037725916827, "grad_norm": 0.18916764855384827, "learning_rate": 0.001, "loss": 1.7902, "step": 977100 }, { "epoch": 57.15622623852138, "grad_norm": 0.16323013603687286, "learning_rate": 0.001, "loss": 1.7922, "step": 977200 }, { "epoch": 57.16207521787448, "grad_norm": 0.17238271236419678, "learning_rate": 0.001, "loss": 1.7937, "step": 977300 }, { "epoch": 57.167924197227585, "grad_norm": 0.18650582432746887, "learning_rate": 0.001, "loss": 1.7942, "step": 977400 }, { "epoch": 57.17377317658069, "grad_norm": 0.14992403984069824, "learning_rate": 0.001, "loss": 1.7979, "step": 977500 }, { "epoch": 57.17962215593379, "grad_norm": 0.16096365451812744, "learning_rate": 0.001, "loss": 1.7893, "step": 977600 }, { "epoch": 57.18547113528689, "grad_norm": 0.22521300613880157, "learning_rate": 0.001, "loss": 1.7901, "step": 977700 }, { "epoch": 57.19132011463999, "grad_norm": 0.16926519572734833, "learning_rate": 0.001, "loss": 1.7893, "step": 977800 }, { "epoch": 57.1971690939931, "grad_norm": 0.2166345864534378, "learning_rate": 0.001, "loss": 1.7984, "step": 977900 }, { "epoch": 57.2030180733462, "grad_norm": 0.14637036621570587, "learning_rate": 0.001, "loss": 1.793, "step": 978000 }, { "epoch": 57.208867052699304, "grad_norm": 0.18517547845840454, "learning_rate": 0.001, "loss": 1.7862, "step": 978100 }, { "epoch": 57.21471603205241, "grad_norm": 0.22636334598064423, "learning_rate": 0.001, "loss": 1.7946, "step": 978200 }, { "epoch": 57.22056501140551, "grad_norm": 0.20287056267261505, "learning_rate": 0.001, "loss": 1.7906, "step": 978300 }, { "epoch": 57.226413990758616, "grad_norm": 0.1641698181629181, "learning_rate": 0.001, "loss": 1.7893, "step": 978400 }, { "epoch": 57.23226297011171, "grad_norm": 0.15890032052993774, "learning_rate": 0.001, "loss": 1.7897, "step": 978500 }, { "epoch": 57.238111949464816, "grad_norm": 0.16916240751743317, "learning_rate": 0.001, "loss": 1.7916, "step": 978600 }, { "epoch": 57.24396092881792, "grad_norm": 0.20026826858520508, "learning_rate": 0.001, "loss": 1.7893, "step": 978700 }, { "epoch": 57.249809908171024, "grad_norm": 0.20609895884990692, "learning_rate": 0.001, "loss": 1.7951, "step": 978800 }, { "epoch": 57.25565888752413, "grad_norm": 0.18339113891124725, "learning_rate": 0.001, "loss": 1.7895, "step": 978900 }, { "epoch": 57.26150786687723, "grad_norm": 0.15640215575695038, "learning_rate": 0.001, "loss": 1.7957, "step": 979000 }, { "epoch": 57.267356846230335, "grad_norm": 0.1965882033109665, "learning_rate": 0.001, "loss": 1.7911, "step": 979100 }, { "epoch": 57.27320582558344, "grad_norm": 0.21738086640834808, "learning_rate": 0.001, "loss": 1.7919, "step": 979200 }, { "epoch": 57.279054804936536, "grad_norm": 0.1663469672203064, "learning_rate": 0.001, "loss": 1.7909, "step": 979300 }, { "epoch": 57.28490378428964, "grad_norm": 0.15934208035469055, "learning_rate": 0.001, "loss": 1.7932, "step": 979400 }, { "epoch": 57.29075276364274, "grad_norm": 0.19806109368801117, "learning_rate": 0.001, "loss": 1.7944, "step": 979500 }, { "epoch": 57.29660174299585, "grad_norm": 0.23939089477062225, "learning_rate": 0.001, "loss": 1.7966, "step": 979600 }, { "epoch": 57.30245072234895, "grad_norm": 0.21823976933956146, "learning_rate": 0.001, "loss": 1.7919, "step": 979700 }, { "epoch": 57.308299701702055, "grad_norm": 0.2258448451757431, "learning_rate": 0.001, "loss": 1.7917, "step": 979800 }, { "epoch": 57.31414868105516, "grad_norm": 0.19521822035312653, "learning_rate": 0.001, "loss": 1.7979, "step": 979900 }, { "epoch": 57.31999766040826, "grad_norm": 0.21349875628948212, "learning_rate": 0.001, "loss": 1.7996, "step": 980000 }, { "epoch": 57.32584663976136, "grad_norm": 0.16523730754852295, "learning_rate": 0.001, "loss": 1.7877, "step": 980100 }, { "epoch": 57.33169561911446, "grad_norm": 0.20198917388916016, "learning_rate": 0.001, "loss": 1.7932, "step": 980200 }, { "epoch": 57.33754459846757, "grad_norm": 0.20073947310447693, "learning_rate": 0.001, "loss": 1.7913, "step": 980300 }, { "epoch": 57.34339357782067, "grad_norm": 0.18516874313354492, "learning_rate": 0.001, "loss": 1.7955, "step": 980400 }, { "epoch": 57.349242557173774, "grad_norm": 0.17465618252754211, "learning_rate": 0.001, "loss": 1.789, "step": 980500 }, { "epoch": 57.35509153652688, "grad_norm": 0.17996397614479065, "learning_rate": 0.001, "loss": 1.7946, "step": 980600 }, { "epoch": 57.36094051587998, "grad_norm": 0.16140615940093994, "learning_rate": 0.001, "loss": 1.7972, "step": 980700 }, { "epoch": 57.36678949523308, "grad_norm": 0.22543853521347046, "learning_rate": 0.001, "loss": 1.7929, "step": 980800 }, { "epoch": 57.37263847458618, "grad_norm": 0.168333500623703, "learning_rate": 0.001, "loss": 1.7903, "step": 980900 }, { "epoch": 57.378487453939286, "grad_norm": 0.147856205701828, "learning_rate": 0.001, "loss": 1.7914, "step": 981000 }, { "epoch": 57.38433643329239, "grad_norm": 0.13480156660079956, "learning_rate": 0.001, "loss": 1.7902, "step": 981100 }, { "epoch": 57.390185412645494, "grad_norm": 0.18931904435157776, "learning_rate": 0.001, "loss": 1.7944, "step": 981200 }, { "epoch": 57.3960343919986, "grad_norm": 0.28261539340019226, "learning_rate": 0.001, "loss": 1.7939, "step": 981300 }, { "epoch": 57.4018833713517, "grad_norm": 0.24956175684928894, "learning_rate": 0.001, "loss": 1.7993, "step": 981400 }, { "epoch": 57.407732350704805, "grad_norm": 0.1745981127023697, "learning_rate": 0.001, "loss": 1.7958, "step": 981500 }, { "epoch": 57.4135813300579, "grad_norm": 0.1633889526128769, "learning_rate": 0.001, "loss": 1.7961, "step": 981600 }, { "epoch": 57.419430309411005, "grad_norm": 0.3124314546585083, "learning_rate": 0.001, "loss": 1.7998, "step": 981700 }, { "epoch": 57.42527928876411, "grad_norm": 0.1930365115404129, "learning_rate": 0.001, "loss": 1.7948, "step": 981800 }, { "epoch": 57.43112826811721, "grad_norm": 0.1648758500814438, "learning_rate": 0.001, "loss": 1.7909, "step": 981900 }, { "epoch": 57.43697724747032, "grad_norm": 0.16528934240341187, "learning_rate": 0.001, "loss": 1.7949, "step": 982000 }, { "epoch": 57.44282622682342, "grad_norm": 0.17579784989356995, "learning_rate": 0.001, "loss": 1.7944, "step": 982100 }, { "epoch": 57.448675206176524, "grad_norm": 0.1912403702735901, "learning_rate": 0.001, "loss": 1.7893, "step": 982200 }, { "epoch": 57.45452418552963, "grad_norm": 0.1828821897506714, "learning_rate": 0.001, "loss": 1.7955, "step": 982300 }, { "epoch": 57.460373164882725, "grad_norm": 0.1511002629995346, "learning_rate": 0.001, "loss": 1.8008, "step": 982400 }, { "epoch": 57.46622214423583, "grad_norm": 0.20619097352027893, "learning_rate": 0.001, "loss": 1.7964, "step": 982500 }, { "epoch": 57.47207112358893, "grad_norm": 0.2572878301143646, "learning_rate": 0.001, "loss": 1.7979, "step": 982600 }, { "epoch": 57.477920102942036, "grad_norm": 0.15788744390010834, "learning_rate": 0.001, "loss": 1.791, "step": 982700 }, { "epoch": 57.48376908229514, "grad_norm": 0.19526205956935883, "learning_rate": 0.001, "loss": 1.7942, "step": 982800 }, { "epoch": 57.489618061648244, "grad_norm": 0.1880405843257904, "learning_rate": 0.001, "loss": 1.7981, "step": 982900 }, { "epoch": 57.49546704100135, "grad_norm": 0.21252195537090302, "learning_rate": 0.001, "loss": 1.7917, "step": 983000 }, { "epoch": 57.50131602035445, "grad_norm": 0.1849905252456665, "learning_rate": 0.001, "loss": 1.7947, "step": 983100 }, { "epoch": 57.50716499970755, "grad_norm": 0.17687547206878662, "learning_rate": 0.001, "loss": 1.7962, "step": 983200 }, { "epoch": 57.51301397906065, "grad_norm": 0.165318101644516, "learning_rate": 0.001, "loss": 1.7932, "step": 983300 }, { "epoch": 57.518862958413756, "grad_norm": 0.15784908831119537, "learning_rate": 0.001, "loss": 1.7931, "step": 983400 }, { "epoch": 57.52471193776686, "grad_norm": 0.1648096889257431, "learning_rate": 0.001, "loss": 1.793, "step": 983500 }, { "epoch": 57.53056091711996, "grad_norm": 0.1689615547657013, "learning_rate": 0.001, "loss": 1.797, "step": 983600 }, { "epoch": 57.53640989647307, "grad_norm": 0.22505171597003937, "learning_rate": 0.001, "loss": 1.7922, "step": 983700 }, { "epoch": 57.54225887582617, "grad_norm": 0.17110267281532288, "learning_rate": 0.001, "loss": 1.7951, "step": 983800 }, { "epoch": 57.54810785517927, "grad_norm": 0.1733420044183731, "learning_rate": 0.001, "loss": 1.7926, "step": 983900 }, { "epoch": 57.55395683453237, "grad_norm": 0.24310730397701263, "learning_rate": 0.001, "loss": 1.8011, "step": 984000 }, { "epoch": 57.559805813885475, "grad_norm": 0.20628398656845093, "learning_rate": 0.001, "loss": 1.7988, "step": 984100 }, { "epoch": 57.56565479323858, "grad_norm": 0.22198496758937836, "learning_rate": 0.001, "loss": 1.7988, "step": 984200 }, { "epoch": 57.57150377259168, "grad_norm": 0.1651669591665268, "learning_rate": 0.001, "loss": 1.8069, "step": 984300 }, { "epoch": 57.57735275194479, "grad_norm": 0.1915457844734192, "learning_rate": 0.001, "loss": 1.7995, "step": 984400 }, { "epoch": 57.58320173129789, "grad_norm": 0.1599539816379547, "learning_rate": 0.001, "loss": 1.7961, "step": 984500 }, { "epoch": 57.589050710650994, "grad_norm": 0.18345198035240173, "learning_rate": 0.001, "loss": 1.7917, "step": 984600 }, { "epoch": 57.59489969000409, "grad_norm": 0.18129271268844604, "learning_rate": 0.001, "loss": 1.7929, "step": 984700 }, { "epoch": 57.600748669357195, "grad_norm": 0.15858577191829681, "learning_rate": 0.001, "loss": 1.7996, "step": 984800 }, { "epoch": 57.6065976487103, "grad_norm": 0.20689888298511505, "learning_rate": 0.001, "loss": 1.8035, "step": 984900 }, { "epoch": 57.6124466280634, "grad_norm": 0.1665891706943512, "learning_rate": 0.001, "loss": 1.799, "step": 985000 }, { "epoch": 57.618295607416506, "grad_norm": 0.18070203065872192, "learning_rate": 0.001, "loss": 1.794, "step": 985100 }, { "epoch": 57.62414458676961, "grad_norm": 0.1721116155385971, "learning_rate": 0.001, "loss": 1.8013, "step": 985200 }, { "epoch": 57.629993566122714, "grad_norm": 0.23909009993076324, "learning_rate": 0.001, "loss": 1.7972, "step": 985300 }, { "epoch": 57.63584254547582, "grad_norm": 0.2481597661972046, "learning_rate": 0.001, "loss": 1.788, "step": 985400 }, { "epoch": 57.641691524828914, "grad_norm": 0.19998714327812195, "learning_rate": 0.001, "loss": 1.7926, "step": 985500 }, { "epoch": 57.64754050418202, "grad_norm": 0.236686110496521, "learning_rate": 0.001, "loss": 1.7972, "step": 985600 }, { "epoch": 57.65338948353512, "grad_norm": 0.1811768263578415, "learning_rate": 0.001, "loss": 1.7984, "step": 985700 }, { "epoch": 57.659238462888226, "grad_norm": 0.22414395213127136, "learning_rate": 0.001, "loss": 1.7921, "step": 985800 }, { "epoch": 57.66508744224133, "grad_norm": 0.1857810914516449, "learning_rate": 0.001, "loss": 1.7986, "step": 985900 }, { "epoch": 57.67093642159443, "grad_norm": 0.21756306290626526, "learning_rate": 0.001, "loss": 1.7917, "step": 986000 }, { "epoch": 57.67678540094754, "grad_norm": 0.20181763172149658, "learning_rate": 0.001, "loss": 1.795, "step": 986100 }, { "epoch": 57.68263438030064, "grad_norm": 0.16607040166854858, "learning_rate": 0.001, "loss": 1.7995, "step": 986200 }, { "epoch": 57.68848335965374, "grad_norm": 0.14730995893478394, "learning_rate": 0.001, "loss": 1.7968, "step": 986300 }, { "epoch": 57.69433233900684, "grad_norm": 0.21450115740299225, "learning_rate": 0.001, "loss": 1.7978, "step": 986400 }, { "epoch": 57.700181318359945, "grad_norm": 0.17240911722183228, "learning_rate": 0.001, "loss": 1.7978, "step": 986500 }, { "epoch": 57.70603029771305, "grad_norm": 0.1963808834552765, "learning_rate": 0.001, "loss": 1.8027, "step": 986600 }, { "epoch": 57.71187927706615, "grad_norm": 0.15165004134178162, "learning_rate": 0.001, "loss": 1.7989, "step": 986700 }, { "epoch": 57.71772825641926, "grad_norm": 0.1649322360754013, "learning_rate": 0.001, "loss": 1.7997, "step": 986800 }, { "epoch": 57.72357723577236, "grad_norm": 0.2001841515302658, "learning_rate": 0.001, "loss": 1.7988, "step": 986900 }, { "epoch": 57.72942621512546, "grad_norm": 0.18617156147956848, "learning_rate": 0.001, "loss": 1.8033, "step": 987000 }, { "epoch": 57.73527519447856, "grad_norm": 0.16642005741596222, "learning_rate": 0.001, "loss": 1.7919, "step": 987100 }, { "epoch": 57.741124173831665, "grad_norm": 0.14426462352275848, "learning_rate": 0.001, "loss": 1.7926, "step": 987200 }, { "epoch": 57.74697315318477, "grad_norm": 0.17606601119041443, "learning_rate": 0.001, "loss": 1.7925, "step": 987300 }, { "epoch": 57.75282213253787, "grad_norm": 0.228191077709198, "learning_rate": 0.001, "loss": 1.7997, "step": 987400 }, { "epoch": 57.758671111890976, "grad_norm": 0.20038066804409027, "learning_rate": 0.001, "loss": 1.8037, "step": 987500 }, { "epoch": 57.76452009124408, "grad_norm": 0.15480256080627441, "learning_rate": 0.001, "loss": 1.7998, "step": 987600 }, { "epoch": 57.770369070597184, "grad_norm": 0.17389029264450073, "learning_rate": 0.001, "loss": 1.7993, "step": 987700 }, { "epoch": 57.77621804995028, "grad_norm": 0.22661566734313965, "learning_rate": 0.001, "loss": 1.7997, "step": 987800 }, { "epoch": 57.782067029303384, "grad_norm": 0.19903163611888885, "learning_rate": 0.001, "loss": 1.7997, "step": 987900 }, { "epoch": 57.78791600865649, "grad_norm": 0.2816971242427826, "learning_rate": 0.001, "loss": 1.7971, "step": 988000 }, { "epoch": 57.79376498800959, "grad_norm": 0.171045184135437, "learning_rate": 0.001, "loss": 1.8006, "step": 988100 }, { "epoch": 57.799613967362696, "grad_norm": 0.276169091463089, "learning_rate": 0.001, "loss": 1.7961, "step": 988200 }, { "epoch": 57.8054629467158, "grad_norm": 0.25031641125679016, "learning_rate": 0.001, "loss": 1.8023, "step": 988300 }, { "epoch": 57.8113119260689, "grad_norm": 0.17864516377449036, "learning_rate": 0.001, "loss": 1.7977, "step": 988400 }, { "epoch": 57.81716090542201, "grad_norm": 0.1495809257030487, "learning_rate": 0.001, "loss": 1.7959, "step": 988500 }, { "epoch": 57.823009884775104, "grad_norm": 0.19558803737163544, "learning_rate": 0.001, "loss": 1.8012, "step": 988600 }, { "epoch": 57.82885886412821, "grad_norm": 0.1895413100719452, "learning_rate": 0.001, "loss": 1.7948, "step": 988700 }, { "epoch": 57.83470784348131, "grad_norm": 0.18765589594841003, "learning_rate": 0.001, "loss": 1.7957, "step": 988800 }, { "epoch": 57.840556822834415, "grad_norm": 0.18040528893470764, "learning_rate": 0.001, "loss": 1.797, "step": 988900 }, { "epoch": 57.84640580218752, "grad_norm": 0.1719161421060562, "learning_rate": 0.001, "loss": 1.7991, "step": 989000 }, { "epoch": 57.85225478154062, "grad_norm": 0.1651698499917984, "learning_rate": 0.001, "loss": 1.7981, "step": 989100 }, { "epoch": 57.85810376089373, "grad_norm": 0.21757414937019348, "learning_rate": 0.001, "loss": 1.7943, "step": 989200 }, { "epoch": 57.86395274024683, "grad_norm": 0.19320259988307953, "learning_rate": 0.001, "loss": 1.8017, "step": 989300 }, { "epoch": 57.86980171959993, "grad_norm": 0.17143405973911285, "learning_rate": 0.001, "loss": 1.7973, "step": 989400 }, { "epoch": 57.87565069895303, "grad_norm": 0.19341327250003815, "learning_rate": 0.001, "loss": 1.7961, "step": 989500 }, { "epoch": 57.881499678306135, "grad_norm": 0.16815494000911713, "learning_rate": 0.001, "loss": 1.8047, "step": 989600 }, { "epoch": 57.88734865765924, "grad_norm": 0.1444270759820938, "learning_rate": 0.001, "loss": 1.7986, "step": 989700 }, { "epoch": 57.89319763701234, "grad_norm": 0.17974592745304108, "learning_rate": 0.001, "loss": 1.7968, "step": 989800 }, { "epoch": 57.899046616365446, "grad_norm": 0.18357791006565094, "learning_rate": 0.001, "loss": 1.7958, "step": 989900 }, { "epoch": 57.90489559571855, "grad_norm": 0.20362992584705353, "learning_rate": 0.001, "loss": 1.7961, "step": 990000 }, { "epoch": 57.91074457507165, "grad_norm": 0.16725744307041168, "learning_rate": 0.001, "loss": 1.8026, "step": 990100 }, { "epoch": 57.91659355442475, "grad_norm": 0.15524034202098846, "learning_rate": 0.001, "loss": 1.7995, "step": 990200 }, { "epoch": 57.922442533777854, "grad_norm": 0.1712902933359146, "learning_rate": 0.001, "loss": 1.7993, "step": 990300 }, { "epoch": 57.92829151313096, "grad_norm": 0.2088298201560974, "learning_rate": 0.001, "loss": 1.7989, "step": 990400 }, { "epoch": 57.93414049248406, "grad_norm": 0.17944292724132538, "learning_rate": 0.001, "loss": 1.8011, "step": 990500 }, { "epoch": 57.939989471837166, "grad_norm": 0.16292689740657806, "learning_rate": 0.001, "loss": 1.7963, "step": 990600 }, { "epoch": 57.94583845119027, "grad_norm": 0.19018042087554932, "learning_rate": 0.001, "loss": 1.7957, "step": 990700 }, { "epoch": 57.95168743054337, "grad_norm": 0.15654079616069794, "learning_rate": 0.001, "loss": 1.8037, "step": 990800 }, { "epoch": 57.95753640989647, "grad_norm": 0.2005060464143753, "learning_rate": 0.001, "loss": 1.799, "step": 990900 }, { "epoch": 57.963385389249574, "grad_norm": 0.2183188498020172, "learning_rate": 0.001, "loss": 1.7984, "step": 991000 }, { "epoch": 57.96923436860268, "grad_norm": 0.20074030756950378, "learning_rate": 0.001, "loss": 1.8012, "step": 991100 }, { "epoch": 57.97508334795578, "grad_norm": 0.1872839629650116, "learning_rate": 0.001, "loss": 1.7966, "step": 991200 }, { "epoch": 57.980932327308885, "grad_norm": 0.20565950870513916, "learning_rate": 0.001, "loss": 1.7964, "step": 991300 }, { "epoch": 57.98678130666199, "grad_norm": 0.16628313064575195, "learning_rate": 0.001, "loss": 1.8016, "step": 991400 }, { "epoch": 57.99263028601509, "grad_norm": 0.1948552280664444, "learning_rate": 0.001, "loss": 1.801, "step": 991500 }, { "epoch": 57.998479265368196, "grad_norm": 0.1819077879190445, "learning_rate": 0.001, "loss": 1.7966, "step": 991600 }, { "epoch": 58.00432824472129, "grad_norm": 0.20730891823768616, "learning_rate": 0.001, "loss": 1.7864, "step": 991700 }, { "epoch": 58.0101772240744, "grad_norm": 0.16970226168632507, "learning_rate": 0.001, "loss": 1.7839, "step": 991800 }, { "epoch": 58.0160262034275, "grad_norm": 0.20718960464000702, "learning_rate": 0.001, "loss": 1.7923, "step": 991900 }, { "epoch": 58.021875182780605, "grad_norm": 0.20178617537021637, "learning_rate": 0.001, "loss": 1.7896, "step": 992000 }, { "epoch": 58.02772416213371, "grad_norm": 0.1936579942703247, "learning_rate": 0.001, "loss": 1.7895, "step": 992100 }, { "epoch": 58.03357314148681, "grad_norm": 0.24504747986793518, "learning_rate": 0.001, "loss": 1.7824, "step": 992200 }, { "epoch": 58.039422120839916, "grad_norm": 0.25443318486213684, "learning_rate": 0.001, "loss": 1.7882, "step": 992300 }, { "epoch": 58.04527110019302, "grad_norm": 0.23598027229309082, "learning_rate": 0.001, "loss": 1.7898, "step": 992400 }, { "epoch": 58.051120079546116, "grad_norm": 0.19523270428180695, "learning_rate": 0.001, "loss": 1.7867, "step": 992500 }, { "epoch": 58.05696905889922, "grad_norm": 0.2026396095752716, "learning_rate": 0.001, "loss": 1.7894, "step": 992600 }, { "epoch": 58.062818038252324, "grad_norm": 0.2406235784292221, "learning_rate": 0.001, "loss": 1.7858, "step": 992700 }, { "epoch": 58.06866701760543, "grad_norm": 0.21544249355793, "learning_rate": 0.001, "loss": 1.7837, "step": 992800 }, { "epoch": 58.07451599695853, "grad_norm": 0.19145609438419342, "learning_rate": 0.001, "loss": 1.7879, "step": 992900 }, { "epoch": 58.080364976311635, "grad_norm": 0.17880433797836304, "learning_rate": 0.001, "loss": 1.7844, "step": 993000 }, { "epoch": 58.08621395566474, "grad_norm": 0.19794540107250214, "learning_rate": 0.001, "loss": 1.7832, "step": 993100 }, { "epoch": 58.092062935017836, "grad_norm": 0.30354297161102295, "learning_rate": 0.001, "loss": 1.7928, "step": 993200 }, { "epoch": 58.09791191437094, "grad_norm": 0.2844955027103424, "learning_rate": 0.001, "loss": 1.7848, "step": 993300 }, { "epoch": 58.10376089372404, "grad_norm": 0.18572160601615906, "learning_rate": 0.001, "loss": 1.7854, "step": 993400 }, { "epoch": 58.10960987307715, "grad_norm": 0.21462774276733398, "learning_rate": 0.001, "loss": 1.7895, "step": 993500 }, { "epoch": 58.11545885243025, "grad_norm": 0.19692137837409973, "learning_rate": 0.001, "loss": 1.7874, "step": 993600 }, { "epoch": 58.121307831783355, "grad_norm": 0.28761252760887146, "learning_rate": 0.001, "loss": 1.7943, "step": 993700 }, { "epoch": 58.12715681113646, "grad_norm": 0.18855708837509155, "learning_rate": 0.001, "loss": 1.786, "step": 993800 }, { "epoch": 58.13300579048956, "grad_norm": 0.23806524276733398, "learning_rate": 0.001, "loss": 1.789, "step": 993900 }, { "epoch": 58.13885476984266, "grad_norm": 0.1905275583267212, "learning_rate": 0.001, "loss": 1.7885, "step": 994000 }, { "epoch": 58.14470374919576, "grad_norm": 0.23639334738254547, "learning_rate": 0.001, "loss": 1.7917, "step": 994100 }, { "epoch": 58.15055272854887, "grad_norm": 0.17188610136508942, "learning_rate": 0.001, "loss": 1.7846, "step": 994200 }, { "epoch": 58.15640170790197, "grad_norm": 0.16511264443397522, "learning_rate": 0.001, "loss": 1.793, "step": 994300 }, { "epoch": 58.162250687255074, "grad_norm": 0.1985061913728714, "learning_rate": 0.001, "loss": 1.7848, "step": 994400 }, { "epoch": 58.16809966660818, "grad_norm": 0.23552744090557098, "learning_rate": 0.001, "loss": 1.79, "step": 994500 }, { "epoch": 58.17394864596128, "grad_norm": 0.19042836129665375, "learning_rate": 0.001, "loss": 1.7885, "step": 994600 }, { "epoch": 58.179797625314386, "grad_norm": 0.2226201295852661, "learning_rate": 0.001, "loss": 1.7876, "step": 994700 }, { "epoch": 58.18564660466748, "grad_norm": 0.18872396647930145, "learning_rate": 0.001, "loss": 1.7895, "step": 994800 }, { "epoch": 58.191495584020586, "grad_norm": 0.16777920722961426, "learning_rate": 0.001, "loss": 1.7875, "step": 994900 }, { "epoch": 58.19734456337369, "grad_norm": 0.2091732770204544, "learning_rate": 0.001, "loss": 1.7901, "step": 995000 }, { "epoch": 58.203193542726794, "grad_norm": 0.18452095985412598, "learning_rate": 0.001, "loss": 1.7912, "step": 995100 }, { "epoch": 58.2090425220799, "grad_norm": 0.19099709391593933, "learning_rate": 0.001, "loss": 1.7881, "step": 995200 }, { "epoch": 58.214891501433, "grad_norm": 0.38942059874534607, "learning_rate": 0.001, "loss": 1.7907, "step": 995300 }, { "epoch": 58.220740480786105, "grad_norm": 0.21196553111076355, "learning_rate": 0.001, "loss": 1.7902, "step": 995400 }, { "epoch": 58.22658946013921, "grad_norm": 0.22476233541965485, "learning_rate": 0.001, "loss": 1.7896, "step": 995500 }, { "epoch": 58.232438439492306, "grad_norm": 0.20390591025352478, "learning_rate": 0.001, "loss": 1.7894, "step": 995600 }, { "epoch": 58.23828741884541, "grad_norm": 0.26947924494743347, "learning_rate": 0.001, "loss": 1.7841, "step": 995700 }, { "epoch": 58.24413639819851, "grad_norm": 0.19086132943630219, "learning_rate": 0.001, "loss": 1.7926, "step": 995800 }, { "epoch": 58.24998537755162, "grad_norm": 0.24325665831565857, "learning_rate": 0.001, "loss": 1.7923, "step": 995900 }, { "epoch": 58.25583435690472, "grad_norm": 0.20660124719142914, "learning_rate": 0.001, "loss": 1.7948, "step": 996000 }, { "epoch": 58.261683336257825, "grad_norm": 0.2020864188671112, "learning_rate": 0.001, "loss": 1.7888, "step": 996100 }, { "epoch": 58.26753231561093, "grad_norm": 0.1877238154411316, "learning_rate": 0.001, "loss": 1.79, "step": 996200 }, { "epoch": 58.273381294964025, "grad_norm": 0.26328200101852417, "learning_rate": 0.001, "loss": 1.793, "step": 996300 }, { "epoch": 58.27923027431713, "grad_norm": 0.18914644420146942, "learning_rate": 0.001, "loss": 1.7909, "step": 996400 }, { "epoch": 58.28507925367023, "grad_norm": 0.1673954427242279, "learning_rate": 0.001, "loss": 1.7953, "step": 996500 }, { "epoch": 58.29092823302334, "grad_norm": 0.21645961701869965, "learning_rate": 0.001, "loss": 1.7911, "step": 996600 }, { "epoch": 58.29677721237644, "grad_norm": 0.18976616859436035, "learning_rate": 0.001, "loss": 1.786, "step": 996700 }, { "epoch": 58.302626191729544, "grad_norm": 0.26110970973968506, "learning_rate": 0.001, "loss": 1.7919, "step": 996800 }, { "epoch": 58.30847517108265, "grad_norm": 0.19269371032714844, "learning_rate": 0.001, "loss": 1.7935, "step": 996900 }, { "epoch": 58.31432415043575, "grad_norm": 0.2263416349887848, "learning_rate": 0.001, "loss": 1.7893, "step": 997000 }, { "epoch": 58.32017312978885, "grad_norm": 0.19634316861629486, "learning_rate": 0.001, "loss": 1.7912, "step": 997100 }, { "epoch": 58.32602210914195, "grad_norm": 0.2170255035161972, "learning_rate": 0.001, "loss": 1.7966, "step": 997200 }, { "epoch": 58.331871088495056, "grad_norm": 0.2111007273197174, "learning_rate": 0.001, "loss": 1.7912, "step": 997300 }, { "epoch": 58.33772006784816, "grad_norm": 0.23544134199619293, "learning_rate": 0.001, "loss": 1.791, "step": 997400 }, { "epoch": 58.343569047201264, "grad_norm": 0.2463575303554535, "learning_rate": 0.001, "loss": 1.7961, "step": 997500 }, { "epoch": 58.34941802655437, "grad_norm": 0.19556322693824768, "learning_rate": 0.001, "loss": 1.7905, "step": 997600 }, { "epoch": 58.35526700590747, "grad_norm": 0.2812010645866394, "learning_rate": 0.001, "loss": 1.7877, "step": 997700 }, { "epoch": 58.361115985260575, "grad_norm": 0.20039044320583344, "learning_rate": 0.001, "loss": 1.7948, "step": 997800 }, { "epoch": 58.36696496461367, "grad_norm": 0.20686565339565277, "learning_rate": 0.001, "loss": 1.795, "step": 997900 }, { "epoch": 58.372813943966776, "grad_norm": 0.2836202383041382, "learning_rate": 0.001, "loss": 1.7921, "step": 998000 }, { "epoch": 58.37866292331988, "grad_norm": 0.17781962454319, "learning_rate": 0.001, "loss": 1.7922, "step": 998100 }, { "epoch": 58.38451190267298, "grad_norm": 0.24799463152885437, "learning_rate": 0.001, "loss": 1.7883, "step": 998200 }, { "epoch": 58.39036088202609, "grad_norm": 0.27403295040130615, "learning_rate": 0.001, "loss": 1.7949, "step": 998300 }, { "epoch": 58.39620986137919, "grad_norm": 0.1786080300807953, "learning_rate": 0.001, "loss": 1.7923, "step": 998400 }, { "epoch": 58.402058840732295, "grad_norm": 0.1971951723098755, "learning_rate": 0.001, "loss": 1.7939, "step": 998500 }, { "epoch": 58.4079078200854, "grad_norm": 0.2548580765724182, "learning_rate": 0.001, "loss": 1.7931, "step": 998600 }, { "epoch": 58.413756799438495, "grad_norm": 0.1722784787416458, "learning_rate": 0.001, "loss": 1.7922, "step": 998700 }, { "epoch": 58.4196057787916, "grad_norm": 0.20509743690490723, "learning_rate": 0.001, "loss": 1.7937, "step": 998800 }, { "epoch": 58.4254547581447, "grad_norm": 0.22023554146289825, "learning_rate": 0.001, "loss": 1.7942, "step": 998900 }, { "epoch": 58.43130373749781, "grad_norm": 0.20146839320659637, "learning_rate": 0.001, "loss": 1.7846, "step": 999000 }, { "epoch": 58.43715271685091, "grad_norm": 0.19283144176006317, "learning_rate": 0.001, "loss": 1.7944, "step": 999100 }, { "epoch": 58.443001696204014, "grad_norm": 0.2368447333574295, "learning_rate": 0.001, "loss": 1.7987, "step": 999200 }, { "epoch": 58.44885067555712, "grad_norm": 0.1786475032567978, "learning_rate": 0.001, "loss": 1.793, "step": 999300 }, { "epoch": 58.454699654910215, "grad_norm": 0.29527977108955383, "learning_rate": 0.001, "loss": 1.793, "step": 999400 }, { "epoch": 58.46054863426332, "grad_norm": 0.21667225658893585, "learning_rate": 0.001, "loss": 1.7882, "step": 999500 }, { "epoch": 58.46639761361642, "grad_norm": 0.39455923438072205, "learning_rate": 0.001, "loss": 1.7883, "step": 999600 }, { "epoch": 58.472246592969526, "grad_norm": 0.17648516595363617, "learning_rate": 0.001, "loss": 1.7936, "step": 999700 }, { "epoch": 58.47809557232263, "grad_norm": 0.21986538171768188, "learning_rate": 0.001, "loss": 1.7979, "step": 999800 }, { "epoch": 58.483944551675734, "grad_norm": 0.3130912780761719, "learning_rate": 0.001, "loss": 1.7946, "step": 999900 }, { "epoch": 58.48979353102884, "grad_norm": 0.1854136735200882, "learning_rate": 0.001, "loss": 1.7874, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_ag_news_accuracy": 0.24446875, "eval_ag_news_bleu_score": 7.577550748929797, "eval_ag_news_bleu_score_sem": 0.5401568028907812, "eval_ag_news_emb_cos_sim": 0.7069768309593201, "eval_ag_news_emb_cos_sim_sem": 0.014590807259082794, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.67460298538208, "eval_ag_news_n_ngrams_match_1": 15.015625, "eval_ag_news_n_ngrams_match_2": 4.3515625, "eval_ag_news_n_ngrams_match_3": 1.6875, "eval_ag_news_num_pred_words": 48.65625, "eval_ag_news_num_true_words": 46.953125, "eval_ag_news_perplexity": 14.50658936813558, "eval_ag_news_pred_num_tokens": 72.328125, "eval_ag_news_rouge_score": 0.3045987958145773, "eval_ag_news_runtime": 36.5739, "eval_ag_news_samples_per_second": 13.671, "eval_ag_news_steps_per_second": 0.027, "eval_ag_news_token_set_f1": 0.3369229003929352, "eval_ag_news_token_set_f1_sem": 0.010305310567581431, "eval_ag_news_token_set_precision": 0.3188398558788751, "eval_ag_news_token_set_recall": 0.3654789640861349, "eval_ag_news_true_num_tokens": 64.421875, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_anthropic_toxic_prompts_accuracy": 0.10378125, "eval_anthropic_toxic_prompts_bleu_score": 42.939325447574355, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.5572503679458514, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8926401138305664, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010754554532468319, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02934655810211727, "eval_anthropic_toxic_prompts_loss": 1.227657675743103, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 8.84375, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.6328125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.6328125, "eval_anthropic_toxic_prompts_num_pred_words": 14.546875, "eval_anthropic_toxic_prompts_num_true_words": 13.890625, "eval_anthropic_toxic_prompts_perplexity": 3.4132252852936418, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.890625, "eval_anthropic_toxic_prompts_rouge_score": 0.6843025020333451, "eval_anthropic_toxic_prompts_runtime": 29.4431, "eval_anthropic_toxic_prompts_samples_per_second": 16.982, "eval_anthropic_toxic_prompts_steps_per_second": 0.034, "eval_anthropic_toxic_prompts_token_set_f1": 0.7032119095717161, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018387995109750383, "eval_anthropic_toxic_prompts_token_set_precision": 0.7016985009481396, "eval_anthropic_toxic_prompts_token_set_recall": 0.7106988331827486, "eval_anthropic_toxic_prompts_true_num_tokens": 17.125, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_arxiv_accuracy": 0.3790625, "eval_arxiv_bleu_score": 2.062984358688617, "eval_arxiv_bleu_score_sem": 0.2064978395678187, "eval_arxiv_emb_cos_sim": 0.509061336517334, "eval_arxiv_emb_cos_sim_sem": 0.018436487764120102, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4101033210754395, "eval_arxiv_n_ngrams_match_1": 14.4921875, "eval_arxiv_n_ngrams_match_2": 2.6875, "eval_arxiv_n_ngrams_match_3": 0.6640625, "eval_arxiv_num_pred_words": 58.671875, "eval_arxiv_num_true_words": 86.3359375, "eval_arxiv_perplexity": 30.26837145853541, "eval_arxiv_pred_num_tokens": 125.8984375, "eval_arxiv_rouge_score": 0.1869979824040406, "eval_arxiv_runtime": 31.1744, "eval_arxiv_samples_per_second": 16.039, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.19300360916719644, "eval_arxiv_token_set_f1_sem": 0.007618493263883346, "eval_arxiv_token_set_precision": 0.13181259919026508, "eval_arxiv_token_set_recall": 0.4489149945847865, "eval_arxiv_true_num_tokens": 124.7578125, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_python_code_alpaca_accuracy": 0.13075, "eval_python_code_alpaca_bleu_score": 29.371161417655074, "eval_python_code_alpaca_bleu_score_sem": 1.5305740996781771, "eval_python_code_alpaca_emb_cos_sim": 0.8791770935058594, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008670511655509472, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4992409944534302, "eval_python_code_alpaca_n_ngrams_match_1": 10.6953125, "eval_python_code_alpaca_n_ngrams_match_2": 5.9296875, "eval_python_code_alpaca_n_ngrams_match_3": 3.5, "eval_python_code_alpaca_num_pred_words": 18.2265625, "eval_python_code_alpaca_num_true_words": 19.1484375, "eval_python_code_alpaca_perplexity": 4.47828873407596, "eval_python_code_alpaca_pred_num_tokens": 23.890625, "eval_python_code_alpaca_rouge_score": 0.5977831929615295, "eval_python_code_alpaca_runtime": 29.6371, "eval_python_code_alpaca_samples_per_second": 16.871, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6182276485219702, "eval_python_code_alpaca_token_set_f1_sem": 0.013498606462231666, "eval_python_code_alpaca_token_set_precision": 0.6010155549907374, "eval_python_code_alpaca_token_set_recall": 0.6418009649553779, "eval_python_code_alpaca_true_num_tokens": 24.484375, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_wikibio_accuracy": 0.368890625, "eval_wikibio_bleu_score": 7.499527801897232, "eval_wikibio_bleu_score_sem": 0.6721926974228882, "eval_wikibio_emb_cos_sim": 0.6400585770606995, "eval_wikibio_emb_cos_sim_sem": 0.02130083553493023, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.685126781463623, "eval_wikibio_n_ngrams_match_1": 15.40625, "eval_wikibio_n_ngrams_match_2": 5.2890625, "eval_wikibio_n_ngrams_match_3": 2.0546875, "eval_wikibio_num_pred_words": 51.171875, "eval_wikibio_num_true_words": 52.203125, "eval_wikibio_perplexity": 14.660059886297342, "eval_wikibio_pred_num_tokens": 105.9609375, "eval_wikibio_rouge_score": 0.31439407434033195, "eval_wikibio_runtime": 30.6112, "eval_wikibio_samples_per_second": 16.334, "eval_wikibio_steps_per_second": 0.033, "eval_wikibio_token_set_f1": 0.33827109632426927, "eval_wikibio_token_set_f1_sem": 0.010794774536193649, "eval_wikibio_token_set_precision": 0.2990542245316439, "eval_wikibio_token_set_recall": 0.42571602092347144, "eval_wikibio_true_num_tokens": 99.6640625, "step": 1000000 }, { "epoch": 58.48979353102884, "eval_msmarco_accuracy": 0.405984375, "eval_msmarco_bleu_score": 19.475593410908893, "eval_msmarco_bleu_score_sem": 1.6236368027205879, "eval_msmarco_emb_cos_sim": 0.800751805305481, "eval_msmarco_emb_cos_sim_sem": 0.016275888308882713, "eval_msmarco_emb_top1_equal": 0.921875, "eval_msmarco_emb_top1_equal_sem": 0.023813825100660324, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.6990176439285278, "eval_msmarco_n_ngrams_match_1": 29.6796875, "eval_msmarco_n_ngrams_match_2": 14.171875, "eval_msmarco_n_ngrams_match_3": 8.4375, "eval_msmarco_num_pred_words": 62.9453125, "eval_msmarco_num_true_words": 62.578125, "eval_msmarco_perplexity": 5.468572666651151, "eval_msmarco_pred_num_tokens": 83.2109375, "eval_msmarco_rouge_score": 0.4710335429438851, "eval_msmarco_runtime": 24.152, "eval_msmarco_samples_per_second": 20.702, "eval_msmarco_steps_per_second": 0.041, "eval_msmarco_token_set_f1": 0.49669042278437986, "eval_msmarco_token_set_f1_sem": 0.014491218326063253, "eval_msmarco_token_set_precision": 0.46033920848672527, "eval_msmarco_token_set_recall": 0.5569982575216046, "eval_msmarco_true_num_tokens": 80.4296875, "step": 1000000 }, { "epoch": 58.49564251038194, "grad_norm": 0.19965922832489014, "learning_rate": 0.001, "loss": 1.7911, "step": 1000100 }, { "epoch": 58.50149148973504, "grad_norm": 0.21057389676570892, "learning_rate": 0.001, "loss": 1.7956, "step": 1000200 }, { "epoch": 58.50734046908814, "grad_norm": 0.18816106021404266, "learning_rate": 0.001, "loss": 1.7977, "step": 1000300 }, { "epoch": 58.513189448441246, "grad_norm": 0.21429957449436188, "learning_rate": 0.001, "loss": 1.7883, "step": 1000400 }, { "epoch": 58.51903842779435, "grad_norm": 0.2794620394706726, "learning_rate": 0.001, "loss": 1.7976, "step": 1000500 }, { "epoch": 58.52488740714745, "grad_norm": 0.19862252473831177, "learning_rate": 0.001, "loss": 1.7919, "step": 1000600 }, { "epoch": 58.53073638650056, "grad_norm": 0.19018499553203583, "learning_rate": 0.001, "loss": 1.7979, "step": 1000700 }, { "epoch": 58.53658536585366, "grad_norm": 0.18953603506088257, "learning_rate": 0.001, "loss": 1.7927, "step": 1000800 }, { "epoch": 58.542434345206765, "grad_norm": 0.2472999542951584, "learning_rate": 0.001, "loss": 1.7921, "step": 1000900 }, { "epoch": 58.54828332455986, "grad_norm": 0.18186460435390472, "learning_rate": 0.001, "loss": 1.7981, "step": 1001000 }, { "epoch": 58.554132303912965, "grad_norm": 0.2262032926082611, "learning_rate": 0.001, "loss": 1.7928, "step": 1001100 }, { "epoch": 58.55998128326607, "grad_norm": 0.2062881737947464, "learning_rate": 0.001, "loss": 1.7947, "step": 1001200 }, { "epoch": 58.56583026261917, "grad_norm": 0.21281084418296814, "learning_rate": 0.001, "loss": 1.7954, "step": 1001300 }, { "epoch": 58.57167924197228, "grad_norm": 0.17713908851146698, "learning_rate": 0.001, "loss": 1.7927, "step": 1001400 }, { "epoch": 58.57752822132538, "grad_norm": 0.20420493185520172, "learning_rate": 0.001, "loss": 1.7909, "step": 1001500 }, { "epoch": 58.583377200678484, "grad_norm": 0.16088859736919403, "learning_rate": 0.001, "loss": 1.7925, "step": 1001600 }, { "epoch": 58.58922618003159, "grad_norm": 0.393774151802063, "learning_rate": 0.001, "loss": 1.7904, "step": 1001700 }, { "epoch": 58.595075159384685, "grad_norm": 0.1628478765487671, "learning_rate": 0.001, "loss": 1.7952, "step": 1001800 }, { "epoch": 58.60092413873779, "grad_norm": 0.24056489765644073, "learning_rate": 0.001, "loss": 1.7929, "step": 1001900 }, { "epoch": 58.60677311809089, "grad_norm": 0.21348243951797485, "learning_rate": 0.001, "loss": 1.7914, "step": 1002000 }, { "epoch": 58.612622097443996, "grad_norm": 0.22409853339195251, "learning_rate": 0.001, "loss": 1.7973, "step": 1002100 }, { "epoch": 58.6184710767971, "grad_norm": 0.18913504481315613, "learning_rate": 0.001, "loss": 1.7995, "step": 1002200 }, { "epoch": 58.624320056150204, "grad_norm": 0.18405993282794952, "learning_rate": 0.001, "loss": 1.7948, "step": 1002300 }, { "epoch": 58.63016903550331, "grad_norm": 0.21252334117889404, "learning_rate": 0.001, "loss": 1.781, "step": 1002400 }, { "epoch": 58.636018014856404, "grad_norm": 0.2653558552265167, "learning_rate": 0.001, "loss": 1.8029, "step": 1002500 }, { "epoch": 58.64186699420951, "grad_norm": 0.1534534990787506, "learning_rate": 0.001, "loss": 1.7934, "step": 1002600 }, { "epoch": 58.64771597356261, "grad_norm": 0.18723970651626587, "learning_rate": 0.001, "loss": 1.7861, "step": 1002700 }, { "epoch": 58.653564952915715, "grad_norm": 0.18101894855499268, "learning_rate": 0.001, "loss": 1.7947, "step": 1002800 }, { "epoch": 58.65941393226882, "grad_norm": 0.2746950089931488, "learning_rate": 0.001, "loss": 1.7978, "step": 1002900 }, { "epoch": 58.66526291162192, "grad_norm": 0.2557464838027954, "learning_rate": 0.001, "loss": 1.7932, "step": 1003000 }, { "epoch": 58.67111189097503, "grad_norm": 0.2113916128873825, "learning_rate": 0.001, "loss": 1.7921, "step": 1003100 }, { "epoch": 58.67696087032813, "grad_norm": 0.18236058950424194, "learning_rate": 0.001, "loss": 1.7957, "step": 1003200 }, { "epoch": 58.68280984968123, "grad_norm": 0.2025773674249649, "learning_rate": 0.001, "loss": 1.8018, "step": 1003300 }, { "epoch": 58.68865882903433, "grad_norm": 0.18123279511928558, "learning_rate": 0.001, "loss": 1.7976, "step": 1003400 }, { "epoch": 58.694507808387435, "grad_norm": 0.19493559002876282, "learning_rate": 0.001, "loss": 1.8008, "step": 1003500 }, { "epoch": 58.70035678774054, "grad_norm": 0.2011268436908722, "learning_rate": 0.001, "loss": 1.794, "step": 1003600 }, { "epoch": 58.70620576709364, "grad_norm": 0.1991853564977646, "learning_rate": 0.001, "loss": 1.7932, "step": 1003700 }, { "epoch": 58.712054746446746, "grad_norm": 0.20841452479362488, "learning_rate": 0.001, "loss": 1.7942, "step": 1003800 }, { "epoch": 58.71790372579985, "grad_norm": 0.2040657103061676, "learning_rate": 0.001, "loss": 1.794, "step": 1003900 }, { "epoch": 58.723752705152954, "grad_norm": 0.20011647045612335, "learning_rate": 0.001, "loss": 1.7981, "step": 1004000 }, { "epoch": 58.72960168450605, "grad_norm": 0.18750928342342377, "learning_rate": 0.001, "loss": 1.7964, "step": 1004100 }, { "epoch": 58.735450663859154, "grad_norm": 0.1728213131427765, "learning_rate": 0.001, "loss": 1.793, "step": 1004200 }, { "epoch": 58.74129964321226, "grad_norm": 0.21521778404712677, "learning_rate": 0.001, "loss": 1.7943, "step": 1004300 }, { "epoch": 58.74714862256536, "grad_norm": 0.18404000997543335, "learning_rate": 0.001, "loss": 1.7952, "step": 1004400 }, { "epoch": 58.752997601918466, "grad_norm": 0.1893298476934433, "learning_rate": 0.001, "loss": 1.7965, "step": 1004500 }, { "epoch": 58.75884658127157, "grad_norm": 0.22568757832050323, "learning_rate": 0.001, "loss": 1.797, "step": 1004600 }, { "epoch": 58.76469556062467, "grad_norm": 0.19903190433979034, "learning_rate": 0.001, "loss": 1.7961, "step": 1004700 }, { "epoch": 58.77054453997778, "grad_norm": 0.2606196999549866, "learning_rate": 0.001, "loss": 1.7958, "step": 1004800 }, { "epoch": 58.776393519330874, "grad_norm": 0.17515574395656586, "learning_rate": 0.001, "loss": 1.7964, "step": 1004900 }, { "epoch": 58.78224249868398, "grad_norm": 0.17974615097045898, "learning_rate": 0.001, "loss": 1.7965, "step": 1005000 }, { "epoch": 58.78809147803708, "grad_norm": 0.22128483653068542, "learning_rate": 0.001, "loss": 1.7974, "step": 1005100 }, { "epoch": 58.793940457390185, "grad_norm": 0.21654653549194336, "learning_rate": 0.001, "loss": 1.7999, "step": 1005200 }, { "epoch": 58.79978943674329, "grad_norm": 0.16464705765247345, "learning_rate": 0.001, "loss": 1.7937, "step": 1005300 }, { "epoch": 58.80563841609639, "grad_norm": 0.17873653769493103, "learning_rate": 0.001, "loss": 1.8014, "step": 1005400 }, { "epoch": 58.8114873954495, "grad_norm": 0.20900359749794006, "learning_rate": 0.001, "loss": 1.7982, "step": 1005500 }, { "epoch": 58.81733637480259, "grad_norm": 0.25107428431510925, "learning_rate": 0.001, "loss": 1.7914, "step": 1005600 }, { "epoch": 58.8231853541557, "grad_norm": 0.19954198598861694, "learning_rate": 0.001, "loss": 1.7883, "step": 1005700 }, { "epoch": 58.8290343335088, "grad_norm": 0.20462766289710999, "learning_rate": 0.001, "loss": 1.7974, "step": 1005800 }, { "epoch": 58.834883312861905, "grad_norm": 0.20146477222442627, "learning_rate": 0.001, "loss": 1.7923, "step": 1005900 }, { "epoch": 58.84073229221501, "grad_norm": 0.20034758746623993, "learning_rate": 0.001, "loss": 1.8002, "step": 1006000 }, { "epoch": 58.84658127156811, "grad_norm": 0.18430808186531067, "learning_rate": 0.001, "loss": 1.7998, "step": 1006100 }, { "epoch": 58.852430250921216, "grad_norm": 0.161350816488266, "learning_rate": 0.001, "loss": 1.7969, "step": 1006200 }, { "epoch": 58.85827923027432, "grad_norm": 0.21445496380329132, "learning_rate": 0.001, "loss": 1.8004, "step": 1006300 }, { "epoch": 58.86412820962742, "grad_norm": 0.20991423726081848, "learning_rate": 0.001, "loss": 1.793, "step": 1006400 }, { "epoch": 58.86997718898052, "grad_norm": 0.1612267643213272, "learning_rate": 0.001, "loss": 1.8034, "step": 1006500 }, { "epoch": 58.875826168333624, "grad_norm": 0.19356951117515564, "learning_rate": 0.001, "loss": 1.7953, "step": 1006600 }, { "epoch": 58.88167514768673, "grad_norm": 0.19052596390247345, "learning_rate": 0.001, "loss": 1.8017, "step": 1006700 }, { "epoch": 58.88752412703983, "grad_norm": 0.26160183548927307, "learning_rate": 0.001, "loss": 1.7967, "step": 1006800 }, { "epoch": 58.893373106392936, "grad_norm": 0.17096367478370667, "learning_rate": 0.001, "loss": 1.7937, "step": 1006900 }, { "epoch": 58.89922208574604, "grad_norm": 0.2321823537349701, "learning_rate": 0.001, "loss": 1.793, "step": 1007000 }, { "epoch": 58.90507106509914, "grad_norm": 0.33643755316734314, "learning_rate": 0.001, "loss": 1.7981, "step": 1007100 }, { "epoch": 58.91092004445224, "grad_norm": 0.24367782473564148, "learning_rate": 0.001, "loss": 1.8014, "step": 1007200 }, { "epoch": 58.916769023805344, "grad_norm": 0.19719922542572021, "learning_rate": 0.001, "loss": 1.7982, "step": 1007300 }, { "epoch": 58.92261800315845, "grad_norm": 0.19778095185756683, "learning_rate": 0.001, "loss": 1.7952, "step": 1007400 }, { "epoch": 58.92846698251155, "grad_norm": 0.23352265357971191, "learning_rate": 0.001, "loss": 1.7976, "step": 1007500 }, { "epoch": 58.934315961864655, "grad_norm": 0.15844735503196716, "learning_rate": 0.001, "loss": 1.7985, "step": 1007600 }, { "epoch": 58.94016494121776, "grad_norm": 0.22580185532569885, "learning_rate": 0.001, "loss": 1.7895, "step": 1007700 }, { "epoch": 58.94601392057086, "grad_norm": 0.22113512456417084, "learning_rate": 0.001, "loss": 1.7993, "step": 1007800 }, { "epoch": 58.95186289992397, "grad_norm": 0.2077968418598175, "learning_rate": 0.001, "loss": 1.7971, "step": 1007900 }, { "epoch": 58.95771187927706, "grad_norm": 0.19124265015125275, "learning_rate": 0.001, "loss": 1.7979, "step": 1008000 }, { "epoch": 58.96356085863017, "grad_norm": 0.17759081721305847, "learning_rate": 0.001, "loss": 1.7914, "step": 1008100 }, { "epoch": 58.96940983798327, "grad_norm": 0.19953452050685883, "learning_rate": 0.001, "loss": 1.7921, "step": 1008200 }, { "epoch": 58.975258817336375, "grad_norm": 0.19346316158771515, "learning_rate": 0.001, "loss": 1.7957, "step": 1008300 }, { "epoch": 58.98110779668948, "grad_norm": 0.215557262301445, "learning_rate": 0.001, "loss": 1.7962, "step": 1008400 }, { "epoch": 58.98695677604258, "grad_norm": 0.19365070760250092, "learning_rate": 0.001, "loss": 1.796, "step": 1008500 }, { "epoch": 58.992805755395686, "grad_norm": 0.17653262615203857, "learning_rate": 0.001, "loss": 1.7962, "step": 1008600 }, { "epoch": 58.99865473474878, "grad_norm": 0.19941292703151703, "learning_rate": 0.001, "loss": 1.8025, "step": 1008700 }, { "epoch": 59.00450371410189, "grad_norm": 0.1605333834886551, "learning_rate": 0.001, "loss": 1.7851, "step": 1008800 }, { "epoch": 59.01035269345499, "grad_norm": 0.21526376903057098, "learning_rate": 0.001, "loss": 1.7885, "step": 1008900 }, { "epoch": 59.016201672808094, "grad_norm": 0.17359444499015808, "learning_rate": 0.001, "loss": 1.7832, "step": 1009000 }, { "epoch": 59.0220506521612, "grad_norm": 0.17815014719963074, "learning_rate": 0.001, "loss": 1.7822, "step": 1009100 }, { "epoch": 59.0278996315143, "grad_norm": 0.15529996156692505, "learning_rate": 0.001, "loss": 1.7824, "step": 1009200 }, { "epoch": 59.033748610867406, "grad_norm": 0.29486319422721863, "learning_rate": 0.001, "loss": 1.7875, "step": 1009300 }, { "epoch": 59.03959759022051, "grad_norm": 0.18095938861370087, "learning_rate": 0.001, "loss": 1.7832, "step": 1009400 }, { "epoch": 59.045446569573606, "grad_norm": 0.16894346475601196, "learning_rate": 0.001, "loss": 1.7843, "step": 1009500 }, { "epoch": 59.05129554892671, "grad_norm": 0.1923302561044693, "learning_rate": 0.001, "loss": 1.7873, "step": 1009600 }, { "epoch": 59.057144528279814, "grad_norm": 0.21019689738750458, "learning_rate": 0.001, "loss": 1.7891, "step": 1009700 }, { "epoch": 59.06299350763292, "grad_norm": 0.24466176331043243, "learning_rate": 0.001, "loss": 1.7852, "step": 1009800 }, { "epoch": 59.06884248698602, "grad_norm": 0.15464641153812408, "learning_rate": 0.001, "loss": 1.7842, "step": 1009900 }, { "epoch": 59.074691466339125, "grad_norm": 0.24237145483493805, "learning_rate": 0.001, "loss": 1.7851, "step": 1010000 }, { "epoch": 59.08054044569223, "grad_norm": 0.16746990382671356, "learning_rate": 0.001, "loss": 1.7845, "step": 1010100 }, { "epoch": 59.08638942504533, "grad_norm": 0.21810761094093323, "learning_rate": 0.001, "loss": 1.7826, "step": 1010200 }, { "epoch": 59.09223840439843, "grad_norm": 0.16082808375358582, "learning_rate": 0.001, "loss": 1.7856, "step": 1010300 }, { "epoch": 59.09808738375153, "grad_norm": 0.162797212600708, "learning_rate": 0.001, "loss": 1.7846, "step": 1010400 }, { "epoch": 59.10393636310464, "grad_norm": 0.17361417412757874, "learning_rate": 0.001, "loss": 1.7861, "step": 1010500 }, { "epoch": 59.10978534245774, "grad_norm": 0.20060007274150848, "learning_rate": 0.001, "loss": 1.7858, "step": 1010600 }, { "epoch": 59.115634321810845, "grad_norm": 0.18389184772968292, "learning_rate": 0.001, "loss": 1.7854, "step": 1010700 }, { "epoch": 59.12148330116395, "grad_norm": 0.15831588208675385, "learning_rate": 0.001, "loss": 1.7856, "step": 1010800 }, { "epoch": 59.12733228051705, "grad_norm": 0.1832629144191742, "learning_rate": 0.001, "loss": 1.7884, "step": 1010900 }, { "epoch": 59.133181259870156, "grad_norm": 0.19644437730312347, "learning_rate": 0.001, "loss": 1.7874, "step": 1011000 }, { "epoch": 59.13903023922325, "grad_norm": 0.18340808153152466, "learning_rate": 0.001, "loss": 1.7881, "step": 1011100 }, { "epoch": 59.14487921857636, "grad_norm": 0.11945226788520813, "learning_rate": 0.001, "loss": 1.7932, "step": 1011200 }, { "epoch": 59.15072819792946, "grad_norm": 0.23662622272968292, "learning_rate": 0.001, "loss": 1.7861, "step": 1011300 }, { "epoch": 59.156577177282564, "grad_norm": 0.1532493531703949, "learning_rate": 0.001, "loss": 1.7882, "step": 1011400 }, { "epoch": 59.16242615663567, "grad_norm": 0.1966126710176468, "learning_rate": 0.001, "loss": 1.7899, "step": 1011500 }, { "epoch": 59.16827513598877, "grad_norm": 0.1699378788471222, "learning_rate": 0.001, "loss": 1.7796, "step": 1011600 }, { "epoch": 59.174124115341876, "grad_norm": 0.15117017924785614, "learning_rate": 0.001, "loss": 1.789, "step": 1011700 }, { "epoch": 59.17997309469497, "grad_norm": 0.1699860841035843, "learning_rate": 0.001, "loss": 1.7846, "step": 1011800 }, { "epoch": 59.185822074048076, "grad_norm": 0.12318763881921768, "learning_rate": 0.001, "loss": 1.7823, "step": 1011900 }, { "epoch": 59.19167105340118, "grad_norm": 0.21107491850852966, "learning_rate": 0.001, "loss": 1.7857, "step": 1012000 }, { "epoch": 59.197520032754284, "grad_norm": 0.13236172497272491, "learning_rate": 0.001, "loss": 1.7894, "step": 1012100 }, { "epoch": 59.20336901210739, "grad_norm": 0.17250901460647583, "learning_rate": 0.001, "loss": 1.7829, "step": 1012200 }, { "epoch": 59.20921799146049, "grad_norm": 0.24841968715190887, "learning_rate": 0.001, "loss": 1.7882, "step": 1012300 }, { "epoch": 59.215066970813595, "grad_norm": 0.18477512896060944, "learning_rate": 0.001, "loss": 1.7948, "step": 1012400 }, { "epoch": 59.2209159501667, "grad_norm": 0.13365976512432098, "learning_rate": 0.001, "loss": 1.7876, "step": 1012500 }, { "epoch": 59.226764929519796, "grad_norm": 0.17096169292926788, "learning_rate": 0.001, "loss": 1.7854, "step": 1012600 }, { "epoch": 59.2326139088729, "grad_norm": 0.16895247995853424, "learning_rate": 0.001, "loss": 1.7855, "step": 1012700 }, { "epoch": 59.238462888226, "grad_norm": 0.15462413430213928, "learning_rate": 0.001, "loss": 1.7923, "step": 1012800 }, { "epoch": 59.24431186757911, "grad_norm": 0.21057824790477753, "learning_rate": 0.001, "loss": 1.7875, "step": 1012900 }, { "epoch": 59.25016084693221, "grad_norm": 0.2039710134267807, "learning_rate": 0.001, "loss": 1.7902, "step": 1013000 }, { "epoch": 59.256009826285315, "grad_norm": 0.17182354629039764, "learning_rate": 0.001, "loss": 1.7953, "step": 1013100 }, { "epoch": 59.26185880563842, "grad_norm": 0.15873458981513977, "learning_rate": 0.001, "loss": 1.7863, "step": 1013200 }, { "epoch": 59.26770778499152, "grad_norm": 0.19244684278964996, "learning_rate": 0.001, "loss": 1.7958, "step": 1013300 }, { "epoch": 59.27355676434462, "grad_norm": 0.12696997821331024, "learning_rate": 0.001, "loss": 1.7867, "step": 1013400 }, { "epoch": 59.27940574369772, "grad_norm": 0.16237923502922058, "learning_rate": 0.001, "loss": 1.7848, "step": 1013500 }, { "epoch": 59.285254723050826, "grad_norm": 0.16602616012096405, "learning_rate": 0.001, "loss": 1.7842, "step": 1013600 }, { "epoch": 59.29110370240393, "grad_norm": 0.18771237134933472, "learning_rate": 0.001, "loss": 1.7891, "step": 1013700 }, { "epoch": 59.296952681757034, "grad_norm": 0.28678449988365173, "learning_rate": 0.001, "loss": 1.7941, "step": 1013800 }, { "epoch": 59.30280166111014, "grad_norm": 0.21822912991046906, "learning_rate": 0.001, "loss": 1.7894, "step": 1013900 }, { "epoch": 59.30865064046324, "grad_norm": 0.16346368193626404, "learning_rate": 0.001, "loss": 1.7933, "step": 1014000 }, { "epoch": 59.314499619816345, "grad_norm": 0.2169182002544403, "learning_rate": 0.001, "loss": 1.7874, "step": 1014100 }, { "epoch": 59.32034859916944, "grad_norm": 0.1723722666501999, "learning_rate": 0.001, "loss": 1.793, "step": 1014200 }, { "epoch": 59.326197578522546, "grad_norm": 0.2753269672393799, "learning_rate": 0.001, "loss": 1.7868, "step": 1014300 }, { "epoch": 59.33204655787565, "grad_norm": 0.16951660811901093, "learning_rate": 0.001, "loss": 1.7889, "step": 1014400 }, { "epoch": 59.33789553722875, "grad_norm": 0.2304544895887375, "learning_rate": 0.001, "loss": 1.7874, "step": 1014500 }, { "epoch": 59.34374451658186, "grad_norm": 0.20261920988559723, "learning_rate": 0.001, "loss": 1.7807, "step": 1014600 }, { "epoch": 59.34959349593496, "grad_norm": 0.1612282693386078, "learning_rate": 0.001, "loss": 1.7904, "step": 1014700 }, { "epoch": 59.355442475288065, "grad_norm": 0.17089231312274933, "learning_rate": 0.001, "loss": 1.7867, "step": 1014800 }, { "epoch": 59.36129145464116, "grad_norm": 0.14829176664352417, "learning_rate": 0.001, "loss": 1.787, "step": 1014900 }, { "epoch": 59.367140433994265, "grad_norm": 0.2575216591358185, "learning_rate": 0.001, "loss": 1.7856, "step": 1015000 }, { "epoch": 59.37298941334737, "grad_norm": 0.16134530305862427, "learning_rate": 0.001, "loss": 1.7933, "step": 1015100 }, { "epoch": 59.37883839270047, "grad_norm": 0.21118372678756714, "learning_rate": 0.001, "loss": 1.7921, "step": 1015200 }, { "epoch": 59.38468737205358, "grad_norm": 0.14325910806655884, "learning_rate": 0.001, "loss": 1.7916, "step": 1015300 }, { "epoch": 59.39053635140668, "grad_norm": 0.20136037468910217, "learning_rate": 0.001, "loss": 1.79, "step": 1015400 }, { "epoch": 59.396385330759784, "grad_norm": 0.2262166291475296, "learning_rate": 0.001, "loss": 1.7903, "step": 1015500 }, { "epoch": 59.40223431011289, "grad_norm": 0.1542498767375946, "learning_rate": 0.001, "loss": 1.7906, "step": 1015600 }, { "epoch": 59.408083289465985, "grad_norm": 0.19068574905395508, "learning_rate": 0.001, "loss": 1.7914, "step": 1015700 }, { "epoch": 59.41393226881909, "grad_norm": 0.162883460521698, "learning_rate": 0.001, "loss": 1.7888, "step": 1015800 }, { "epoch": 59.41978124817219, "grad_norm": 0.2146407514810562, "learning_rate": 0.001, "loss": 1.7913, "step": 1015900 }, { "epoch": 59.425630227525296, "grad_norm": 0.24564608931541443, "learning_rate": 0.001, "loss": 1.7983, "step": 1016000 }, { "epoch": 59.4314792068784, "grad_norm": 0.15846258401870728, "learning_rate": 0.001, "loss": 1.788, "step": 1016100 }, { "epoch": 59.437328186231504, "grad_norm": 0.1523941308259964, "learning_rate": 0.001, "loss": 1.7897, "step": 1016200 }, { "epoch": 59.44317716558461, "grad_norm": 0.25825023651123047, "learning_rate": 0.001, "loss": 1.7879, "step": 1016300 }, { "epoch": 59.44902614493771, "grad_norm": 0.15828166902065277, "learning_rate": 0.001, "loss": 1.7942, "step": 1016400 }, { "epoch": 59.45487512429081, "grad_norm": 0.21820464730262756, "learning_rate": 0.001, "loss": 1.7959, "step": 1016500 }, { "epoch": 59.46072410364391, "grad_norm": 0.25896722078323364, "learning_rate": 0.001, "loss": 1.7924, "step": 1016600 }, { "epoch": 59.466573082997016, "grad_norm": 0.24399703741073608, "learning_rate": 0.001, "loss": 1.7909, "step": 1016700 }, { "epoch": 59.47242206235012, "grad_norm": 0.1703064888715744, "learning_rate": 0.001, "loss": 1.7916, "step": 1016800 }, { "epoch": 59.47827104170322, "grad_norm": 0.15914517641067505, "learning_rate": 0.001, "loss": 1.7922, "step": 1016900 }, { "epoch": 59.48412002105633, "grad_norm": 0.23087847232818604, "learning_rate": 0.001, "loss": 1.7978, "step": 1017000 }, { "epoch": 59.48996900040943, "grad_norm": 0.13433222472667694, "learning_rate": 0.001, "loss": 1.7971, "step": 1017100 }, { "epoch": 59.495817979762535, "grad_norm": 0.21587887406349182, "learning_rate": 0.001, "loss": 1.7892, "step": 1017200 }, { "epoch": 59.50166695911563, "grad_norm": 0.16126199066638947, "learning_rate": 0.001, "loss": 1.7911, "step": 1017300 }, { "epoch": 59.507515938468735, "grad_norm": 0.15127691626548767, "learning_rate": 0.001, "loss": 1.7943, "step": 1017400 }, { "epoch": 59.51336491782184, "grad_norm": 0.17929255962371826, "learning_rate": 0.001, "loss": 1.7924, "step": 1017500 }, { "epoch": 59.51921389717494, "grad_norm": 0.15389002859592438, "learning_rate": 0.001, "loss": 1.7881, "step": 1017600 }, { "epoch": 59.52506287652805, "grad_norm": 0.1484398990869522, "learning_rate": 0.001, "loss": 1.794, "step": 1017700 }, { "epoch": 59.53091185588115, "grad_norm": 0.21504776179790497, "learning_rate": 0.001, "loss": 1.7905, "step": 1017800 }, { "epoch": 59.536760835234254, "grad_norm": 0.21927569806575775, "learning_rate": 0.001, "loss": 1.7932, "step": 1017900 }, { "epoch": 59.54260981458735, "grad_norm": 0.16370737552642822, "learning_rate": 0.001, "loss": 1.7923, "step": 1018000 }, { "epoch": 59.548458793940455, "grad_norm": 0.22037088871002197, "learning_rate": 0.001, "loss": 1.7902, "step": 1018100 }, { "epoch": 59.55430777329356, "grad_norm": 0.21702420711517334, "learning_rate": 0.001, "loss": 1.7965, "step": 1018200 }, { "epoch": 59.56015675264666, "grad_norm": 0.2113087922334671, "learning_rate": 0.001, "loss": 1.7954, "step": 1018300 }, { "epoch": 59.566005731999766, "grad_norm": 0.20329821109771729, "learning_rate": 0.001, "loss": 1.8002, "step": 1018400 }, { "epoch": 59.57185471135287, "grad_norm": 0.20518898963928223, "learning_rate": 0.001, "loss": 1.7939, "step": 1018500 }, { "epoch": 59.577703690705974, "grad_norm": 0.1530771702528, "learning_rate": 0.001, "loss": 1.7969, "step": 1018600 }, { "epoch": 59.58355267005908, "grad_norm": 0.2717495858669281, "learning_rate": 0.001, "loss": 1.7887, "step": 1018700 }, { "epoch": 59.589401649412174, "grad_norm": 0.15871942043304443, "learning_rate": 0.001, "loss": 1.7926, "step": 1018800 }, { "epoch": 59.59525062876528, "grad_norm": 0.19616484642028809, "learning_rate": 0.001, "loss": 1.7955, "step": 1018900 }, { "epoch": 59.60109960811838, "grad_norm": 0.19566306471824646, "learning_rate": 0.001, "loss": 1.789, "step": 1019000 }, { "epoch": 59.606948587471486, "grad_norm": 0.1350986808538437, "learning_rate": 0.001, "loss": 1.7925, "step": 1019100 }, { "epoch": 59.61279756682459, "grad_norm": 0.16357874870300293, "learning_rate": 0.001, "loss": 1.795, "step": 1019200 }, { "epoch": 59.61864654617769, "grad_norm": 0.15373238921165466, "learning_rate": 0.001, "loss": 1.7912, "step": 1019300 }, { "epoch": 59.6244955255308, "grad_norm": 0.18257543444633484, "learning_rate": 0.001, "loss": 1.796, "step": 1019400 }, { "epoch": 59.6303445048839, "grad_norm": 0.15516383945941925, "learning_rate": 0.001, "loss": 1.7925, "step": 1019500 }, { "epoch": 59.636193484237, "grad_norm": 0.16955654323101044, "learning_rate": 0.001, "loss": 1.7942, "step": 1019600 }, { "epoch": 59.6420424635901, "grad_norm": 0.17887911200523376, "learning_rate": 0.001, "loss": 1.7944, "step": 1019700 }, { "epoch": 59.647891442943205, "grad_norm": 0.16687041521072388, "learning_rate": 0.001, "loss": 1.7954, "step": 1019800 }, { "epoch": 59.65374042229631, "grad_norm": 0.15276916325092316, "learning_rate": 0.001, "loss": 1.7906, "step": 1019900 }, { "epoch": 59.65958940164941, "grad_norm": 0.1573338657617569, "learning_rate": 0.001, "loss": 1.7901, "step": 1020000 }, { "epoch": 59.66543838100252, "grad_norm": 0.2000024914741516, "learning_rate": 0.001, "loss": 1.792, "step": 1020100 }, { "epoch": 59.67128736035562, "grad_norm": 0.15654121339321136, "learning_rate": 0.001, "loss": 1.7897, "step": 1020200 }, { "epoch": 59.677136339708724, "grad_norm": 0.16604381799697876, "learning_rate": 0.001, "loss": 1.7925, "step": 1020300 }, { "epoch": 59.68298531906182, "grad_norm": 0.1434793919324875, "learning_rate": 0.001, "loss": 1.79, "step": 1020400 }, { "epoch": 59.688834298414925, "grad_norm": 0.2011117786169052, "learning_rate": 0.001, "loss": 1.7906, "step": 1020500 }, { "epoch": 59.69468327776803, "grad_norm": 0.22452029585838318, "learning_rate": 0.001, "loss": 1.7933, "step": 1020600 }, { "epoch": 59.70053225712113, "grad_norm": 0.1461828202009201, "learning_rate": 0.001, "loss": 1.7891, "step": 1020700 }, { "epoch": 59.706381236474236, "grad_norm": 0.22908560931682587, "learning_rate": 0.001, "loss": 1.7993, "step": 1020800 }, { "epoch": 59.71223021582734, "grad_norm": 0.22903205454349518, "learning_rate": 0.001, "loss": 1.7907, "step": 1020900 }, { "epoch": 59.718079195180444, "grad_norm": 0.19067791104316711, "learning_rate": 0.001, "loss": 1.7928, "step": 1021000 }, { "epoch": 59.72392817453354, "grad_norm": 0.1999502331018448, "learning_rate": 0.001, "loss": 1.7946, "step": 1021100 }, { "epoch": 59.729777153886644, "grad_norm": 0.28643375635147095, "learning_rate": 0.001, "loss": 1.7976, "step": 1021200 }, { "epoch": 59.73562613323975, "grad_norm": 0.15415507555007935, "learning_rate": 0.001, "loss": 1.7945, "step": 1021300 }, { "epoch": 59.74147511259285, "grad_norm": 0.20127984881401062, "learning_rate": 0.001, "loss": 1.7912, "step": 1021400 }, { "epoch": 59.747324091945956, "grad_norm": 0.24557171761989594, "learning_rate": 0.001, "loss": 1.791, "step": 1021500 }, { "epoch": 59.75317307129906, "grad_norm": 0.1442347913980484, "learning_rate": 0.001, "loss": 1.796, "step": 1021600 }, { "epoch": 59.75902205065216, "grad_norm": 0.188969224691391, "learning_rate": 0.001, "loss": 1.7927, "step": 1021700 }, { "epoch": 59.76487103000527, "grad_norm": 0.1438504010438919, "learning_rate": 0.001, "loss": 1.7944, "step": 1021800 }, { "epoch": 59.770720009358364, "grad_norm": 0.16215381026268005, "learning_rate": 0.001, "loss": 1.7907, "step": 1021900 }, { "epoch": 59.77656898871147, "grad_norm": 0.22671417891979218, "learning_rate": 0.001, "loss": 1.7895, "step": 1022000 }, { "epoch": 59.78241796806457, "grad_norm": 0.13378244638442993, "learning_rate": 0.001, "loss": 1.7942, "step": 1022100 }, { "epoch": 59.788266947417675, "grad_norm": 0.22477687895298004, "learning_rate": 0.001, "loss": 1.7914, "step": 1022200 }, { "epoch": 59.79411592677078, "grad_norm": 0.16880853474140167, "learning_rate": 0.001, "loss": 1.7985, "step": 1022300 }, { "epoch": 59.79996490612388, "grad_norm": 0.20755122601985931, "learning_rate": 0.001, "loss": 1.7945, "step": 1022400 }, { "epoch": 59.80581388547699, "grad_norm": 0.24913296103477478, "learning_rate": 0.001, "loss": 1.7916, "step": 1022500 }, { "epoch": 59.81166286483009, "grad_norm": 0.1971634477376938, "learning_rate": 0.001, "loss": 1.797, "step": 1022600 }, { "epoch": 59.81751184418319, "grad_norm": 0.18513187766075134, "learning_rate": 0.001, "loss": 1.7942, "step": 1022700 }, { "epoch": 59.82336082353629, "grad_norm": 0.15974204242229462, "learning_rate": 0.001, "loss": 1.7934, "step": 1022800 }, { "epoch": 59.829209802889395, "grad_norm": 0.19647400081157684, "learning_rate": 0.001, "loss": 1.8008, "step": 1022900 }, { "epoch": 59.8350587822425, "grad_norm": 0.1708020567893982, "learning_rate": 0.001, "loss": 1.7967, "step": 1023000 }, { "epoch": 59.8409077615956, "grad_norm": 0.17096801102161407, "learning_rate": 0.001, "loss": 1.7957, "step": 1023100 }, { "epoch": 59.846756740948706, "grad_norm": 0.14250651001930237, "learning_rate": 0.001, "loss": 1.7933, "step": 1023200 }, { "epoch": 59.85260572030181, "grad_norm": 0.20874600112438202, "learning_rate": 0.001, "loss": 1.7968, "step": 1023300 }, { "epoch": 59.858454699654914, "grad_norm": 0.1253586858510971, "learning_rate": 0.001, "loss": 1.7991, "step": 1023400 }, { "epoch": 59.86430367900801, "grad_norm": 0.15062215924263, "learning_rate": 0.001, "loss": 1.7972, "step": 1023500 }, { "epoch": 59.870152658361114, "grad_norm": 0.1392335295677185, "learning_rate": 0.001, "loss": 1.7926, "step": 1023600 }, { "epoch": 59.87600163771422, "grad_norm": 0.1430715173482895, "learning_rate": 0.001, "loss": 1.79, "step": 1023700 }, { "epoch": 59.88185061706732, "grad_norm": 0.21880097687244415, "learning_rate": 0.001, "loss": 1.7889, "step": 1023800 }, { "epoch": 59.887699596420426, "grad_norm": 0.13424457609653473, "learning_rate": 0.001, "loss": 1.7908, "step": 1023900 }, { "epoch": 59.89354857577353, "grad_norm": 0.24910898506641388, "learning_rate": 0.001, "loss": 1.7943, "step": 1024000 }, { "epoch": 59.89939755512663, "grad_norm": 0.1275789886713028, "learning_rate": 0.001, "loss": 1.7975, "step": 1024100 }, { "epoch": 59.90524653447973, "grad_norm": 0.18317507207393646, "learning_rate": 0.001, "loss": 1.7952, "step": 1024200 }, { "epoch": 59.911095513832834, "grad_norm": 0.18518832325935364, "learning_rate": 0.001, "loss": 1.799, "step": 1024300 }, { "epoch": 59.91694449318594, "grad_norm": 0.16630665957927704, "learning_rate": 0.001, "loss": 1.7919, "step": 1024400 }, { "epoch": 59.92279347253904, "grad_norm": 0.14224685728549957, "learning_rate": 0.001, "loss": 1.7939, "step": 1024500 }, { "epoch": 59.928642451892145, "grad_norm": 0.2228730320930481, "learning_rate": 0.001, "loss": 1.7877, "step": 1024600 }, { "epoch": 59.93449143124525, "grad_norm": 0.14506632089614868, "learning_rate": 0.001, "loss": 1.7997, "step": 1024700 }, { "epoch": 59.94034041059835, "grad_norm": 0.27310171723365784, "learning_rate": 0.001, "loss": 1.7959, "step": 1024800 }, { "epoch": 59.946189389951456, "grad_norm": 0.22022268176078796, "learning_rate": 0.001, "loss": 1.7973, "step": 1024900 }, { "epoch": 59.95203836930455, "grad_norm": 0.16477572917938232, "learning_rate": 0.001, "loss": 1.7913, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_ag_news_accuracy": 0.243484375, "eval_ag_news_bleu_score": 7.208580307949747, "eval_ag_news_bleu_score_sem": 0.4374454648685965, "eval_ag_news_emb_cos_sim": 0.7013816833496094, "eval_ag_news_emb_cos_sim_sem": 0.015993952751159668, "eval_ag_news_emb_top1_equal": 0.9609375, "eval_ag_news_emb_top1_equal_sem": 0.017191974446177483, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.698885679244995, "eval_ag_news_n_ngrams_match_1": 13.8046875, "eval_ag_news_n_ngrams_match_2": 4.171875, "eval_ag_news_n_ngrams_match_3": 1.640625, "eval_ag_news_num_pred_words": 45.2578125, "eval_ag_news_num_true_words": 44.703125, "eval_ag_news_perplexity": 14.863160165713817, "eval_ag_news_pred_num_tokens": 69.7890625, "eval_ag_news_rouge_score": 0.30183570654734987, "eval_ag_news_runtime": 38.6254, "eval_ag_news_samples_per_second": 12.945, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.33559845222280216, "eval_ag_news_token_set_f1_sem": 0.010119128430723323, "eval_ag_news_token_set_precision": 0.3192734750538188, "eval_ag_news_token_set_recall": 0.3632799082984318, "eval_ag_news_true_num_tokens": 63.4296875, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_anthropic_toxic_prompts_accuracy": 0.102828125, "eval_anthropic_toxic_prompts_bleu_score": 45.599585648532724, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.781797739441838, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8873087763786316, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010290504433214664, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.1796875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03406800775380022, "eval_anthropic_toxic_prompts_loss": 1.2183188199996948, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.7890625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.03125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.078125, "eval_anthropic_toxic_prompts_num_pred_words": 15.046875, "eval_anthropic_toxic_prompts_num_true_words": 15.1640625, "eval_anthropic_toxic_prompts_perplexity": 3.381498045381425, "eval_anthropic_toxic_prompts_pred_num_tokens": 19.140625, "eval_anthropic_toxic_prompts_rouge_score": 0.7070432429933111, "eval_anthropic_toxic_prompts_runtime": 29.9403, "eval_anthropic_toxic_prompts_samples_per_second": 16.7, "eval_anthropic_toxic_prompts_steps_per_second": 0.033, "eval_anthropic_toxic_prompts_token_set_f1": 0.7222801299928606, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01937517260406968, "eval_anthropic_toxic_prompts_token_set_precision": 0.7120003053316644, "eval_anthropic_toxic_prompts_token_set_recall": 0.7372287563058192, "eval_anthropic_toxic_prompts_true_num_tokens": 18.6953125, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_arxiv_accuracy": 0.377046875, "eval_arxiv_bleu_score": 2.122497894348904, "eval_arxiv_bleu_score_sem": 0.15320729943988057, "eval_arxiv_emb_cos_sim": 0.5261175632476807, "eval_arxiv_emb_cos_sim_sem": 0.017469581216573715, "eval_arxiv_emb_top1_equal": 0.9765625, "eval_arxiv_emb_top1_equal_sem": 0.013424675911664963, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.422072649002075, "eval_arxiv_n_ngrams_match_1": 16.671875, "eval_arxiv_n_ngrams_match_2": 2.78125, "eval_arxiv_n_ngrams_match_3": 0.5703125, "eval_arxiv_num_pred_words": 62.1953125, "eval_arxiv_num_true_words": 86.8984375, "eval_arxiv_perplexity": 30.632840395151167, "eval_arxiv_pred_num_tokens": 125.375, "eval_arxiv_rouge_score": 0.2098828184208154, "eval_arxiv_runtime": 31.6088, "eval_arxiv_samples_per_second": 15.818, "eval_arxiv_steps_per_second": 0.032, "eval_arxiv_token_set_f1": 0.209539403559485, "eval_arxiv_token_set_f1_sem": 0.0072587624596307096, "eval_arxiv_token_set_precision": 0.14808856912388169, "eval_arxiv_token_set_recall": 0.4033280249856932, "eval_arxiv_true_num_tokens": 125.1328125, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_python_code_alpaca_accuracy": 0.13071875, "eval_python_code_alpaca_bleu_score": 27.52446191332287, "eval_python_code_alpaca_bleu_score_sem": 1.493456668382057, "eval_python_code_alpaca_emb_cos_sim": 0.8734008073806763, "eval_python_code_alpaca_emb_cos_sim_sem": 0.00931334588676691, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4908379316329956, "eval_python_code_alpaca_n_ngrams_match_1": 10.2265625, "eval_python_code_alpaca_n_ngrams_match_2": 5.7421875, "eval_python_code_alpaca_n_ngrams_match_3": 3.234375, "eval_python_code_alpaca_num_pred_words": 17.5234375, "eval_python_code_alpaca_num_true_words": 18.28125, "eval_python_code_alpaca_perplexity": 4.440815059840491, "eval_python_code_alpaca_pred_num_tokens": 23.7734375, "eval_python_code_alpaca_rouge_score": 0.5917935607578115, "eval_python_code_alpaca_runtime": 29.9639, "eval_python_code_alpaca_samples_per_second": 16.687, "eval_python_code_alpaca_steps_per_second": 0.033, "eval_python_code_alpaca_token_set_f1": 0.6167477893752812, "eval_python_code_alpaca_token_set_f1_sem": 0.014697093510443574, "eval_python_code_alpaca_token_set_precision": 0.602482889030924, "eval_python_code_alpaca_token_set_recall": 0.6401744493098345, "eval_python_code_alpaca_true_num_tokens": 24.390625, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_wikibio_accuracy": 0.362265625, "eval_wikibio_bleu_score": 7.237924476387085, "eval_wikibio_bleu_score_sem": 0.5903481845093248, "eval_wikibio_emb_cos_sim": 0.6150421500205994, "eval_wikibio_emb_cos_sim_sem": 0.022159527987241745, "eval_wikibio_emb_top1_equal": 0.953125, "eval_wikibio_emb_top1_equal_sem": 0.01875615119934082, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7065913677215576, "eval_wikibio_n_ngrams_match_1": 16.2734375, "eval_wikibio_n_ngrams_match_2": 5.421875, "eval_wikibio_n_ngrams_match_3": 2.265625, "eval_wikibio_num_pred_words": 55.90625, "eval_wikibio_num_true_words": 53.53125, "eval_wikibio_perplexity": 14.978133452992584, "eval_wikibio_pred_num_tokens": 104.25, "eval_wikibio_rouge_score": 0.30503179499623234, "eval_wikibio_runtime": 31.2535, "eval_wikibio_samples_per_second": 15.998, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32724553078754925, "eval_wikibio_token_set_f1_sem": 0.010553738484888027, "eval_wikibio_token_set_precision": 0.2924794476636203, "eval_wikibio_token_set_recall": 0.4004861868720145, "eval_wikibio_true_num_tokens": 103.171875, "step": 1025000 }, { "epoch": 59.95203836930455, "eval_msmarco_accuracy": 0.39809375, "eval_msmarco_bleu_score": 18.17448251921011, "eval_msmarco_bleu_score_sem": 1.562172649514596, "eval_msmarco_emb_cos_sim": 0.8060867786407471, "eval_msmarco_emb_cos_sim_sem": 0.014887221157550812, "eval_msmarco_emb_top1_equal": 0.9609375, "eval_msmarco_emb_top1_equal_sem": 0.017191974446177483, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7207189798355103, "eval_msmarco_n_ngrams_match_1": 29.1640625, "eval_msmarco_n_ngrams_match_2": 13.75, "eval_msmarco_n_ngrams_match_3": 8.0234375, "eval_msmarco_num_pred_words": 60.7890625, "eval_msmarco_num_true_words": 61.3984375, "eval_msmarco_perplexity": 5.5885450713894365, "eval_msmarco_pred_num_tokens": 82.5859375, "eval_msmarco_rouge_score": 0.45147681306756837, "eval_msmarco_runtime": 26.8171, "eval_msmarco_samples_per_second": 18.645, "eval_msmarco_steps_per_second": 0.037, "eval_msmarco_token_set_f1": 0.4810011809930661, "eval_msmarco_token_set_f1_sem": 0.014663981949926493, "eval_msmarco_token_set_precision": 0.44652481101740044, "eval_msmarco_token_set_recall": 0.5358035180425048, "eval_msmarco_true_num_tokens": 79.6796875, "step": 1025000 }, { "epoch": 59.95788734865766, "grad_norm": 0.1438513994216919, "learning_rate": 0.001, "loss": 1.7932, "step": 1025100 }, { "epoch": 59.96373632801076, "grad_norm": 0.15053404867649078, "learning_rate": 0.001, "loss": 1.7925, "step": 1025200 }, { "epoch": 59.969585307363865, "grad_norm": 0.17841222882270813, "learning_rate": 0.001, "loss": 1.797, "step": 1025300 }, { "epoch": 59.97543428671697, "grad_norm": 0.17908257246017456, "learning_rate": 0.001, "loss": 1.7962, "step": 1025400 }, { "epoch": 59.98128326607007, "grad_norm": 0.2581585645675659, "learning_rate": 0.001, "loss": 1.7993, "step": 1025500 }, { "epoch": 59.987132245423176, "grad_norm": 0.19206148386001587, "learning_rate": 0.001, "loss": 1.7997, "step": 1025600 }, { "epoch": 59.99298122477628, "grad_norm": 0.20373176038265228, "learning_rate": 0.001, "loss": 1.7898, "step": 1025700 }, { "epoch": 59.998830204129376, "grad_norm": 0.1808301955461502, "learning_rate": 0.001, "loss": 1.7938, "step": 1025800 }, { "epoch": 60.00467918348248, "grad_norm": 0.15678322315216064, "learning_rate": 0.001, "loss": 1.7944, "step": 1025900 }, { "epoch": 60.010528162835584, "grad_norm": 0.18355704843997955, "learning_rate": 0.001, "loss": 1.7787, "step": 1026000 }, { "epoch": 60.01637714218869, "grad_norm": 0.16840824484825134, "learning_rate": 0.001, "loss": 1.7785, "step": 1026100 }, { "epoch": 60.02222612154179, "grad_norm": 0.21366392076015472, "learning_rate": 0.001, "loss": 1.7879, "step": 1026200 }, { "epoch": 60.028075100894895, "grad_norm": 0.16314567625522614, "learning_rate": 0.001, "loss": 1.7767, "step": 1026300 }, { "epoch": 60.033924080248, "grad_norm": 0.1934056431055069, "learning_rate": 0.001, "loss": 1.7818, "step": 1026400 }, { "epoch": 60.0397730596011, "grad_norm": 0.1595102697610855, "learning_rate": 0.001, "loss": 1.779, "step": 1026500 }, { "epoch": 60.0456220389542, "grad_norm": 0.1971622109413147, "learning_rate": 0.001, "loss": 1.788, "step": 1026600 }, { "epoch": 60.0514710183073, "grad_norm": 0.18463687598705292, "learning_rate": 0.001, "loss": 1.7747, "step": 1026700 }, { "epoch": 60.05731999766041, "grad_norm": 0.20984643697738647, "learning_rate": 0.001, "loss": 1.7847, "step": 1026800 }, { "epoch": 60.06316897701351, "grad_norm": 0.1372976303100586, "learning_rate": 0.001, "loss": 1.7897, "step": 1026900 }, { "epoch": 60.069017956366615, "grad_norm": 0.16567492485046387, "learning_rate": 0.001, "loss": 1.7868, "step": 1027000 }, { "epoch": 60.07486693571972, "grad_norm": 0.18888939917087555, "learning_rate": 0.001, "loss": 1.783, "step": 1027100 }, { "epoch": 60.08071591507282, "grad_norm": 0.16699658334255219, "learning_rate": 0.001, "loss": 1.7817, "step": 1027200 }, { "epoch": 60.08656489442592, "grad_norm": 0.1840934306383133, "learning_rate": 0.001, "loss": 1.7801, "step": 1027300 }, { "epoch": 60.09241387377902, "grad_norm": 0.21444091200828552, "learning_rate": 0.001, "loss": 1.7885, "step": 1027400 }, { "epoch": 60.09826285313213, "grad_norm": 0.18830232322216034, "learning_rate": 0.001, "loss": 1.7859, "step": 1027500 }, { "epoch": 60.10411183248523, "grad_norm": 0.1509484499692917, "learning_rate": 0.001, "loss": 1.785, "step": 1027600 }, { "epoch": 60.109960811838334, "grad_norm": 0.20758092403411865, "learning_rate": 0.001, "loss": 1.7823, "step": 1027700 }, { "epoch": 60.11580979119144, "grad_norm": 0.19146552681922913, "learning_rate": 0.001, "loss": 1.783, "step": 1027800 }, { "epoch": 60.12165877054454, "grad_norm": 0.161995992064476, "learning_rate": 0.001, "loss": 1.7853, "step": 1027900 }, { "epoch": 60.127507749897646, "grad_norm": 0.1434583216905594, "learning_rate": 0.001, "loss": 1.7824, "step": 1028000 }, { "epoch": 60.13335672925074, "grad_norm": 0.2132447361946106, "learning_rate": 0.001, "loss": 1.788, "step": 1028100 }, { "epoch": 60.139205708603846, "grad_norm": 0.18431724607944489, "learning_rate": 0.001, "loss": 1.7909, "step": 1028200 }, { "epoch": 60.14505468795695, "grad_norm": 0.23792153596878052, "learning_rate": 0.001, "loss": 1.7883, "step": 1028300 }, { "epoch": 60.150903667310054, "grad_norm": 0.16733777523040771, "learning_rate": 0.001, "loss": 1.7884, "step": 1028400 }, { "epoch": 60.15675264666316, "grad_norm": 0.1743660718202591, "learning_rate": 0.001, "loss": 1.7811, "step": 1028500 }, { "epoch": 60.16260162601626, "grad_norm": 0.22821402549743652, "learning_rate": 0.001, "loss": 1.7861, "step": 1028600 }, { "epoch": 60.168450605369365, "grad_norm": 0.21498611569404602, "learning_rate": 0.001, "loss": 1.7897, "step": 1028700 }, { "epoch": 60.17429958472247, "grad_norm": 0.2564675807952881, "learning_rate": 0.001, "loss": 1.7878, "step": 1028800 }, { "epoch": 60.180148564075566, "grad_norm": 0.13882267475128174, "learning_rate": 0.001, "loss": 1.7835, "step": 1028900 }, { "epoch": 60.18599754342867, "grad_norm": 0.18324673175811768, "learning_rate": 0.001, "loss": 1.783, "step": 1029000 }, { "epoch": 60.19184652278177, "grad_norm": 0.19947467744350433, "learning_rate": 0.001, "loss": 1.7852, "step": 1029100 }, { "epoch": 60.19769550213488, "grad_norm": 0.20585514605045319, "learning_rate": 0.001, "loss": 1.7849, "step": 1029200 }, { "epoch": 60.20354448148798, "grad_norm": 0.29886120557785034, "learning_rate": 0.001, "loss": 1.7873, "step": 1029300 }, { "epoch": 60.209393460841085, "grad_norm": 0.16444459557533264, "learning_rate": 0.001, "loss": 1.7794, "step": 1029400 }, { "epoch": 60.21524244019419, "grad_norm": 0.27110135555267334, "learning_rate": 0.001, "loss": 1.7839, "step": 1029500 }, { "epoch": 60.22109141954729, "grad_norm": 0.2245672196149826, "learning_rate": 0.001, "loss": 1.7921, "step": 1029600 }, { "epoch": 60.22694039890039, "grad_norm": 0.20709174871444702, "learning_rate": 0.001, "loss": 1.7862, "step": 1029700 }, { "epoch": 60.23278937825349, "grad_norm": 0.16573403775691986, "learning_rate": 0.001, "loss": 1.7834, "step": 1029800 }, { "epoch": 60.2386383576066, "grad_norm": 0.14195190370082855, "learning_rate": 0.001, "loss": 1.7858, "step": 1029900 }, { "epoch": 60.2444873369597, "grad_norm": 0.18038326501846313, "learning_rate": 0.001, "loss": 1.7859, "step": 1030000 }, { "epoch": 60.250336316312804, "grad_norm": 0.15306495130062103, "learning_rate": 0.001, "loss": 1.7841, "step": 1030100 }, { "epoch": 60.25618529566591, "grad_norm": 0.15824078023433685, "learning_rate": 0.001, "loss": 1.7879, "step": 1030200 }, { "epoch": 60.26203427501901, "grad_norm": 0.18723122775554657, "learning_rate": 0.001, "loss": 1.788, "step": 1030300 }, { "epoch": 60.26788325437211, "grad_norm": 0.16254480183124542, "learning_rate": 0.001, "loss": 1.7923, "step": 1030400 }, { "epoch": 60.27373223372521, "grad_norm": 0.2207721769809723, "learning_rate": 0.001, "loss": 1.786, "step": 1030500 }, { "epoch": 60.279581213078316, "grad_norm": 0.19396814703941345, "learning_rate": 0.001, "loss": 1.7901, "step": 1030600 }, { "epoch": 60.28543019243142, "grad_norm": 0.1589633971452713, "learning_rate": 0.001, "loss": 1.7894, "step": 1030700 }, { "epoch": 60.291279171784524, "grad_norm": 0.1926635056734085, "learning_rate": 0.001, "loss": 1.7909, "step": 1030800 }, { "epoch": 60.29712815113763, "grad_norm": 0.16888687014579773, "learning_rate": 0.001, "loss": 1.7808, "step": 1030900 }, { "epoch": 60.30297713049073, "grad_norm": 0.23312602937221527, "learning_rate": 0.001, "loss": 1.7867, "step": 1031000 }, { "epoch": 60.308826109843835, "grad_norm": 0.14564450085163116, "learning_rate": 0.001, "loss": 1.7935, "step": 1031100 }, { "epoch": 60.31467508919693, "grad_norm": 0.1691483110189438, "learning_rate": 0.001, "loss": 1.7833, "step": 1031200 }, { "epoch": 60.320524068550036, "grad_norm": 0.18070825934410095, "learning_rate": 0.001, "loss": 1.7887, "step": 1031300 }, { "epoch": 60.32637304790314, "grad_norm": 0.14143916964530945, "learning_rate": 0.001, "loss": 1.7858, "step": 1031400 }, { "epoch": 60.33222202725624, "grad_norm": 0.22633720934391022, "learning_rate": 0.001, "loss": 1.782, "step": 1031500 }, { "epoch": 60.33807100660935, "grad_norm": 0.1709565967321396, "learning_rate": 0.001, "loss": 1.7901, "step": 1031600 }, { "epoch": 60.34391998596245, "grad_norm": 0.14733979105949402, "learning_rate": 0.001, "loss": 1.7901, "step": 1031700 }, { "epoch": 60.349768965315555, "grad_norm": 0.2037566751241684, "learning_rate": 0.001, "loss": 1.7946, "step": 1031800 }, { "epoch": 60.35561794466866, "grad_norm": 0.2298807054758072, "learning_rate": 0.001, "loss": 1.7841, "step": 1031900 }, { "epoch": 60.361466924021755, "grad_norm": 0.1829633265733719, "learning_rate": 0.001, "loss": 1.7944, "step": 1032000 }, { "epoch": 60.36731590337486, "grad_norm": 0.18782249093055725, "learning_rate": 0.001, "loss": 1.7898, "step": 1032100 }, { "epoch": 60.37316488272796, "grad_norm": 0.20858187973499298, "learning_rate": 0.001, "loss": 1.7851, "step": 1032200 }, { "epoch": 60.37901386208107, "grad_norm": 0.19103118777275085, "learning_rate": 0.001, "loss": 1.7869, "step": 1032300 }, { "epoch": 60.38486284143417, "grad_norm": 0.2175997644662857, "learning_rate": 0.001, "loss": 1.7887, "step": 1032400 }, { "epoch": 60.390711820787274, "grad_norm": 0.15570004284381866, "learning_rate": 0.001, "loss": 1.7895, "step": 1032500 }, { "epoch": 60.39656080014038, "grad_norm": 0.16573934257030487, "learning_rate": 0.001, "loss": 1.7889, "step": 1032600 }, { "epoch": 60.40240977949348, "grad_norm": 0.14889563620090485, "learning_rate": 0.001, "loss": 1.7821, "step": 1032700 }, { "epoch": 60.40825875884658, "grad_norm": 0.16061334311962128, "learning_rate": 0.001, "loss": 1.7951, "step": 1032800 }, { "epoch": 60.41410773819968, "grad_norm": 0.17927053570747375, "learning_rate": 0.001, "loss": 1.7896, "step": 1032900 }, { "epoch": 60.419956717552786, "grad_norm": 0.16667310893535614, "learning_rate": 0.001, "loss": 1.7859, "step": 1033000 }, { "epoch": 60.42580569690589, "grad_norm": 0.19241446256637573, "learning_rate": 0.001, "loss": 1.7963, "step": 1033100 }, { "epoch": 60.431654676258994, "grad_norm": 0.17965167760849, "learning_rate": 0.001, "loss": 1.7868, "step": 1033200 }, { "epoch": 60.4375036556121, "grad_norm": 0.1972946673631668, "learning_rate": 0.001, "loss": 1.7873, "step": 1033300 }, { "epoch": 60.4433526349652, "grad_norm": 0.2086273729801178, "learning_rate": 0.001, "loss": 1.7881, "step": 1033400 }, { "epoch": 60.4492016143183, "grad_norm": 0.21290206909179688, "learning_rate": 0.001, "loss": 1.7924, "step": 1033500 }, { "epoch": 60.4550505936714, "grad_norm": 0.20829936861991882, "learning_rate": 0.001, "loss": 1.7965, "step": 1033600 }, { "epoch": 60.460899573024506, "grad_norm": 0.2046913057565689, "learning_rate": 0.001, "loss": 1.7907, "step": 1033700 }, { "epoch": 60.46674855237761, "grad_norm": 0.19433645904064178, "learning_rate": 0.001, "loss": 1.7919, "step": 1033800 }, { "epoch": 60.47259753173071, "grad_norm": 0.24087879061698914, "learning_rate": 0.001, "loss": 1.7862, "step": 1033900 }, { "epoch": 60.47844651108382, "grad_norm": 0.16692443192005157, "learning_rate": 0.001, "loss": 1.7923, "step": 1034000 }, { "epoch": 60.48429549043692, "grad_norm": 0.15973635017871857, "learning_rate": 0.001, "loss": 1.7868, "step": 1034100 }, { "epoch": 60.490144469790025, "grad_norm": 0.15782153606414795, "learning_rate": 0.001, "loss": 1.7894, "step": 1034200 }, { "epoch": 60.49599344914312, "grad_norm": 0.16063056886196136, "learning_rate": 0.001, "loss": 1.784, "step": 1034300 }, { "epoch": 60.501842428496225, "grad_norm": 0.195884570479393, "learning_rate": 0.001, "loss": 1.7881, "step": 1034400 }, { "epoch": 60.50769140784933, "grad_norm": 0.1930210143327713, "learning_rate": 0.001, "loss": 1.7911, "step": 1034500 }, { "epoch": 60.51354038720243, "grad_norm": 0.1654813587665558, "learning_rate": 0.001, "loss": 1.7946, "step": 1034600 }, { "epoch": 60.51938936655554, "grad_norm": 0.16308480501174927, "learning_rate": 0.001, "loss": 1.7921, "step": 1034700 }, { "epoch": 60.52523834590864, "grad_norm": 0.1575622409582138, "learning_rate": 0.001, "loss": 1.7929, "step": 1034800 }, { "epoch": 60.531087325261744, "grad_norm": 0.1831761747598648, "learning_rate": 0.001, "loss": 1.7865, "step": 1034900 }, { "epoch": 60.53693630461485, "grad_norm": 0.1980128437280655, "learning_rate": 0.001, "loss": 1.7906, "step": 1035000 }, { "epoch": 60.542785283967945, "grad_norm": 0.16555583477020264, "learning_rate": 0.001, "loss": 1.7946, "step": 1035100 }, { "epoch": 60.54863426332105, "grad_norm": 0.1940310001373291, "learning_rate": 0.001, "loss": 1.7939, "step": 1035200 }, { "epoch": 60.55448324267415, "grad_norm": 0.20619072020053864, "learning_rate": 0.001, "loss": 1.7896, "step": 1035300 }, { "epoch": 60.560332222027256, "grad_norm": 0.1506974846124649, "learning_rate": 0.001, "loss": 1.7877, "step": 1035400 }, { "epoch": 60.56618120138036, "grad_norm": 0.18579286336898804, "learning_rate": 0.001, "loss": 1.7869, "step": 1035500 }, { "epoch": 60.572030180733464, "grad_norm": 0.1651705652475357, "learning_rate": 0.001, "loss": 1.7859, "step": 1035600 }, { "epoch": 60.57787916008657, "grad_norm": 0.1659947782754898, "learning_rate": 0.001, "loss": 1.7873, "step": 1035700 }, { "epoch": 60.58372813943967, "grad_norm": 0.20349743962287903, "learning_rate": 0.001, "loss": 1.7933, "step": 1035800 }, { "epoch": 60.58957711879277, "grad_norm": 0.1714601069688797, "learning_rate": 0.001, "loss": 1.7893, "step": 1035900 }, { "epoch": 60.59542609814587, "grad_norm": 0.1940300315618515, "learning_rate": 0.001, "loss": 1.7895, "step": 1036000 }, { "epoch": 60.601275077498975, "grad_norm": 0.3247120976448059, "learning_rate": 0.001, "loss": 1.7929, "step": 1036100 }, { "epoch": 60.60712405685208, "grad_norm": 0.20579054951667786, "learning_rate": 0.001, "loss": 1.7919, "step": 1036200 }, { "epoch": 60.61297303620518, "grad_norm": 0.23893961310386658, "learning_rate": 0.001, "loss": 1.7945, "step": 1036300 }, { "epoch": 60.61882201555829, "grad_norm": 0.149692103266716, "learning_rate": 0.001, "loss": 1.7908, "step": 1036400 }, { "epoch": 60.62467099491139, "grad_norm": 0.16767601668834686, "learning_rate": 0.001, "loss": 1.7901, "step": 1036500 }, { "epoch": 60.63051997426449, "grad_norm": 0.19700683653354645, "learning_rate": 0.001, "loss": 1.7897, "step": 1036600 }, { "epoch": 60.63636895361759, "grad_norm": 0.26001352071762085, "learning_rate": 0.001, "loss": 1.7881, "step": 1036700 }, { "epoch": 60.642217932970695, "grad_norm": 0.20685434341430664, "learning_rate": 0.001, "loss": 1.7922, "step": 1036800 }, { "epoch": 60.6480669123238, "grad_norm": 0.1948578953742981, "learning_rate": 0.001, "loss": 1.7904, "step": 1036900 }, { "epoch": 60.6539158916769, "grad_norm": 0.18770942091941833, "learning_rate": 0.001, "loss": 1.7986, "step": 1037000 }, { "epoch": 60.659764871030006, "grad_norm": 0.1486603170633316, "learning_rate": 0.001, "loss": 1.791, "step": 1037100 }, { "epoch": 60.66561385038311, "grad_norm": 0.1787182092666626, "learning_rate": 0.001, "loss": 1.794, "step": 1037200 }, { "epoch": 60.671462829736214, "grad_norm": 0.15008439123630524, "learning_rate": 0.001, "loss": 1.7934, "step": 1037300 }, { "epoch": 60.67731180908931, "grad_norm": 0.19169984757900238, "learning_rate": 0.001, "loss": 1.7947, "step": 1037400 }, { "epoch": 60.683160788442414, "grad_norm": 0.2794022858142853, "learning_rate": 0.001, "loss": 1.799, "step": 1037500 }, { "epoch": 60.68900976779552, "grad_norm": 0.1615874171257019, "learning_rate": 0.001, "loss": 1.7931, "step": 1037600 }, { "epoch": 60.69485874714862, "grad_norm": 0.18251879513263702, "learning_rate": 0.001, "loss": 1.7914, "step": 1037700 }, { "epoch": 60.700707726501726, "grad_norm": 0.15317943692207336, "learning_rate": 0.001, "loss": 1.7885, "step": 1037800 }, { "epoch": 60.70655670585483, "grad_norm": 0.19623029232025146, "learning_rate": 0.001, "loss": 1.7891, "step": 1037900 }, { "epoch": 60.71240568520793, "grad_norm": 0.15883848071098328, "learning_rate": 0.001, "loss": 1.7892, "step": 1038000 }, { "epoch": 60.71825466456104, "grad_norm": 0.166472390294075, "learning_rate": 0.001, "loss": 1.7889, "step": 1038100 }, { "epoch": 60.724103643914134, "grad_norm": 0.17164082825183868, "learning_rate": 0.001, "loss": 1.7939, "step": 1038200 }, { "epoch": 60.72995262326724, "grad_norm": 0.1962481439113617, "learning_rate": 0.001, "loss": 1.7963, "step": 1038300 }, { "epoch": 60.73580160262034, "grad_norm": 0.18026471138000488, "learning_rate": 0.001, "loss": 1.7927, "step": 1038400 }, { "epoch": 60.741650581973445, "grad_norm": 0.17746701836585999, "learning_rate": 0.001, "loss": 1.7956, "step": 1038500 }, { "epoch": 60.74749956132655, "grad_norm": 0.1749163717031479, "learning_rate": 0.001, "loss": 1.7923, "step": 1038600 }, { "epoch": 60.75334854067965, "grad_norm": 0.16496556997299194, "learning_rate": 0.001, "loss": 1.79, "step": 1038700 }, { "epoch": 60.75919752003276, "grad_norm": 0.23624666035175323, "learning_rate": 0.001, "loss": 1.7894, "step": 1038800 }, { "epoch": 60.76504649938586, "grad_norm": 0.17582088708877563, "learning_rate": 0.001, "loss": 1.7916, "step": 1038900 }, { "epoch": 60.77089547873896, "grad_norm": 0.14868412911891937, "learning_rate": 0.001, "loss": 1.7909, "step": 1039000 }, { "epoch": 60.77674445809206, "grad_norm": 0.1842622309923172, "learning_rate": 0.001, "loss": 1.7889, "step": 1039100 }, { "epoch": 60.782593437445165, "grad_norm": 0.19292335212230682, "learning_rate": 0.001, "loss": 1.797, "step": 1039200 }, { "epoch": 60.78844241679827, "grad_norm": 0.24689340591430664, "learning_rate": 0.001, "loss": 1.7906, "step": 1039300 }, { "epoch": 60.79429139615137, "grad_norm": 0.1445535123348236, "learning_rate": 0.001, "loss": 1.7935, "step": 1039400 }, { "epoch": 60.800140375504476, "grad_norm": 0.19584333896636963, "learning_rate": 0.001, "loss": 1.7893, "step": 1039500 }, { "epoch": 60.80598935485758, "grad_norm": 0.1873491257429123, "learning_rate": 0.001, "loss": 1.7896, "step": 1039600 }, { "epoch": 60.81183833421068, "grad_norm": 0.17070016264915466, "learning_rate": 0.001, "loss": 1.7907, "step": 1039700 }, { "epoch": 60.81768731356378, "grad_norm": 0.1848248839378357, "learning_rate": 0.001, "loss": 1.7954, "step": 1039800 }, { "epoch": 60.823536292916884, "grad_norm": 0.14565159380435944, "learning_rate": 0.001, "loss": 1.7899, "step": 1039900 }, { "epoch": 60.82938527226999, "grad_norm": 0.1496947705745697, "learning_rate": 0.001, "loss": 1.7917, "step": 1040000 }, { "epoch": 60.83523425162309, "grad_norm": 0.21340838074684143, "learning_rate": 0.001, "loss": 1.794, "step": 1040100 }, { "epoch": 60.841083230976196, "grad_norm": 0.15369756519794464, "learning_rate": 0.001, "loss": 1.789, "step": 1040200 }, { "epoch": 60.8469322103293, "grad_norm": 0.1693764626979828, "learning_rate": 0.001, "loss": 1.7953, "step": 1040300 }, { "epoch": 60.8527811896824, "grad_norm": 0.15974734723567963, "learning_rate": 0.001, "loss": 1.7915, "step": 1040400 }, { "epoch": 60.8586301690355, "grad_norm": 0.18321140110492706, "learning_rate": 0.001, "loss": 1.7956, "step": 1040500 }, { "epoch": 60.864479148388604, "grad_norm": 0.17346110939979553, "learning_rate": 0.001, "loss": 1.789, "step": 1040600 }, { "epoch": 60.87032812774171, "grad_norm": 0.16028951108455658, "learning_rate": 0.001, "loss": 1.7892, "step": 1040700 }, { "epoch": 60.87617710709481, "grad_norm": 0.17942927777767181, "learning_rate": 0.001, "loss": 1.79, "step": 1040800 }, { "epoch": 60.882026086447915, "grad_norm": 0.22869308292865753, "learning_rate": 0.001, "loss": 1.7924, "step": 1040900 }, { "epoch": 60.88787506580102, "grad_norm": 0.13880114257335663, "learning_rate": 0.001, "loss": 1.7891, "step": 1041000 }, { "epoch": 60.89372404515412, "grad_norm": 0.23091673851013184, "learning_rate": 0.001, "loss": 1.7904, "step": 1041100 }, { "epoch": 60.89957302450723, "grad_norm": 0.17309731245040894, "learning_rate": 0.001, "loss": 1.7924, "step": 1041200 }, { "epoch": 60.90542200386032, "grad_norm": 0.16720062494277954, "learning_rate": 0.001, "loss": 1.7887, "step": 1041300 }, { "epoch": 60.91127098321343, "grad_norm": 0.22983160614967346, "learning_rate": 0.001, "loss": 1.7953, "step": 1041400 }, { "epoch": 60.91711996256653, "grad_norm": 0.15786214172840118, "learning_rate": 0.001, "loss": 1.8024, "step": 1041500 }, { "epoch": 60.922968941919635, "grad_norm": 0.19002500176429749, "learning_rate": 0.001, "loss": 1.796, "step": 1041600 }, { "epoch": 60.92881792127274, "grad_norm": 0.20241771638393402, "learning_rate": 0.001, "loss": 1.7964, "step": 1041700 }, { "epoch": 60.93466690062584, "grad_norm": 0.1991070955991745, "learning_rate": 0.001, "loss": 1.791, "step": 1041800 }, { "epoch": 60.940515879978946, "grad_norm": 0.16278521716594696, "learning_rate": 0.001, "loss": 1.7908, "step": 1041900 }, { "epoch": 60.94636485933205, "grad_norm": 0.20788051187992096, "learning_rate": 0.001, "loss": 1.792, "step": 1042000 }, { "epoch": 60.95221383868515, "grad_norm": 0.15597963333129883, "learning_rate": 0.001, "loss": 1.7887, "step": 1042100 }, { "epoch": 60.95806281803825, "grad_norm": 0.1629766970872879, "learning_rate": 0.001, "loss": 1.7922, "step": 1042200 }, { "epoch": 60.963911797391354, "grad_norm": 0.17499686777591705, "learning_rate": 0.001, "loss": 1.794, "step": 1042300 }, { "epoch": 60.96976077674446, "grad_norm": 0.15742340683937073, "learning_rate": 0.001, "loss": 1.7966, "step": 1042400 }, { "epoch": 60.97560975609756, "grad_norm": 0.14395473897457123, "learning_rate": 0.001, "loss": 1.7916, "step": 1042500 }, { "epoch": 60.981458735450666, "grad_norm": 0.18258421123027802, "learning_rate": 0.001, "loss": 1.7915, "step": 1042600 }, { "epoch": 60.98730771480377, "grad_norm": 0.20718824863433838, "learning_rate": 0.001, "loss": 1.7941, "step": 1042700 }, { "epoch": 60.993156694156866, "grad_norm": 0.21750140190124512, "learning_rate": 0.001, "loss": 1.7949, "step": 1042800 }, { "epoch": 60.99900567350997, "grad_norm": 0.25547873973846436, "learning_rate": 0.001, "loss": 1.7931, "step": 1042900 }, { "epoch": 61.004854652863074, "grad_norm": 0.14570628106594086, "learning_rate": 0.001, "loss": 1.7802, "step": 1043000 }, { "epoch": 61.01070363221618, "grad_norm": 0.18136633932590485, "learning_rate": 0.001, "loss": 1.7806, "step": 1043100 }, { "epoch": 61.01655261156928, "grad_norm": 0.16012202203273773, "learning_rate": 0.001, "loss": 1.7799, "step": 1043200 }, { "epoch": 61.022401590922385, "grad_norm": 0.13786515593528748, "learning_rate": 0.001, "loss": 1.7847, "step": 1043300 }, { "epoch": 61.02825057027549, "grad_norm": 0.15798887610435486, "learning_rate": 0.001, "loss": 1.7825, "step": 1043400 }, { "epoch": 61.03409954962859, "grad_norm": 0.21326321363449097, "learning_rate": 0.001, "loss": 1.7773, "step": 1043500 }, { "epoch": 61.03994852898169, "grad_norm": 0.20661313831806183, "learning_rate": 0.001, "loss": 1.7803, "step": 1043600 }, { "epoch": 61.04579750833479, "grad_norm": 0.17069974541664124, "learning_rate": 0.001, "loss": 1.7778, "step": 1043700 }, { "epoch": 61.0516464876879, "grad_norm": 0.18697769939899445, "learning_rate": 0.001, "loss": 1.7819, "step": 1043800 }, { "epoch": 61.057495467041, "grad_norm": 0.2060280591249466, "learning_rate": 0.001, "loss": 1.7841, "step": 1043900 }, { "epoch": 61.063344446394105, "grad_norm": 0.1638539433479309, "learning_rate": 0.001, "loss": 1.7816, "step": 1044000 }, { "epoch": 61.06919342574721, "grad_norm": 0.16799035668373108, "learning_rate": 0.001, "loss": 1.7804, "step": 1044100 }, { "epoch": 61.07504240510031, "grad_norm": 0.16265389323234558, "learning_rate": 0.001, "loss": 1.7825, "step": 1044200 }, { "epoch": 61.080891384453416, "grad_norm": 0.20008091628551483, "learning_rate": 0.001, "loss": 1.7778, "step": 1044300 }, { "epoch": 61.08674036380651, "grad_norm": 0.18710952997207642, "learning_rate": 0.001, "loss": 1.7823, "step": 1044400 }, { "epoch": 61.09258934315962, "grad_norm": 0.21353425085544586, "learning_rate": 0.001, "loss": 1.7833, "step": 1044500 }, { "epoch": 61.09843832251272, "grad_norm": 0.1631237119436264, "learning_rate": 0.001, "loss": 1.7819, "step": 1044600 }, { "epoch": 61.104287301865824, "grad_norm": 0.17791467905044556, "learning_rate": 0.001, "loss": 1.786, "step": 1044700 }, { "epoch": 61.11013628121893, "grad_norm": 0.19051390886306763, "learning_rate": 0.001, "loss": 1.7888, "step": 1044800 }, { "epoch": 61.11598526057203, "grad_norm": 0.21823839843273163, "learning_rate": 0.001, "loss": 1.7885, "step": 1044900 }, { "epoch": 61.121834239925136, "grad_norm": 0.1625642329454422, "learning_rate": 0.001, "loss": 1.785, "step": 1045000 }, { "epoch": 61.12768321927824, "grad_norm": 0.17044787108898163, "learning_rate": 0.001, "loss": 1.7838, "step": 1045100 }, { "epoch": 61.133532198631336, "grad_norm": 0.17646360397338867, "learning_rate": 0.001, "loss": 1.7822, "step": 1045200 }, { "epoch": 61.13938117798444, "grad_norm": 0.19126224517822266, "learning_rate": 0.001, "loss": 1.7797, "step": 1045300 }, { "epoch": 61.145230157337544, "grad_norm": 0.17087656259536743, "learning_rate": 0.001, "loss": 1.7886, "step": 1045400 }, { "epoch": 61.15107913669065, "grad_norm": 0.17650938034057617, "learning_rate": 0.001, "loss": 1.7874, "step": 1045500 }, { "epoch": 61.15692811604375, "grad_norm": 0.17543736100196838, "learning_rate": 0.001, "loss": 1.78, "step": 1045600 }, { "epoch": 61.162777095396855, "grad_norm": 0.17858198285102844, "learning_rate": 0.001, "loss": 1.7845, "step": 1045700 }, { "epoch": 61.16862607474996, "grad_norm": 0.16995134949684143, "learning_rate": 0.001, "loss": 1.785, "step": 1045800 }, { "epoch": 61.174475054103056, "grad_norm": 0.3635355532169342, "learning_rate": 0.001, "loss": 1.7788, "step": 1045900 }, { "epoch": 61.18032403345616, "grad_norm": 0.1625664234161377, "learning_rate": 0.001, "loss": 1.7849, "step": 1046000 }, { "epoch": 61.18617301280926, "grad_norm": 0.17092588543891907, "learning_rate": 0.001, "loss": 1.7797, "step": 1046100 }, { "epoch": 61.19202199216237, "grad_norm": 0.16884998977184296, "learning_rate": 0.001, "loss": 1.7802, "step": 1046200 }, { "epoch": 61.19787097151547, "grad_norm": 0.1972898542881012, "learning_rate": 0.001, "loss": 1.7821, "step": 1046300 }, { "epoch": 61.203719950868575, "grad_norm": 0.15420609712600708, "learning_rate": 0.001, "loss": 1.7858, "step": 1046400 }, { "epoch": 61.20956893022168, "grad_norm": 0.22000430524349213, "learning_rate": 0.001, "loss": 1.7864, "step": 1046500 }, { "epoch": 61.21541790957478, "grad_norm": 0.17217783629894257, "learning_rate": 0.001, "loss": 1.7817, "step": 1046600 }, { "epoch": 61.22126688892788, "grad_norm": 0.14973050355911255, "learning_rate": 0.001, "loss": 1.7886, "step": 1046700 }, { "epoch": 61.22711586828098, "grad_norm": 0.22106006741523743, "learning_rate": 0.001, "loss": 1.7831, "step": 1046800 }, { "epoch": 61.232964847634086, "grad_norm": 0.20664964616298676, "learning_rate": 0.001, "loss": 1.7857, "step": 1046900 }, { "epoch": 61.23881382698719, "grad_norm": 0.20567072927951813, "learning_rate": 0.001, "loss": 1.7878, "step": 1047000 }, { "epoch": 61.244662806340294, "grad_norm": 0.1727057844400406, "learning_rate": 0.001, "loss": 1.7847, "step": 1047100 }, { "epoch": 61.2505117856934, "grad_norm": 0.33784177899360657, "learning_rate": 0.001, "loss": 1.7831, "step": 1047200 }, { "epoch": 61.2563607650465, "grad_norm": 0.1962907463312149, "learning_rate": 0.001, "loss": 1.7873, "step": 1047300 }, { "epoch": 61.262209744399605, "grad_norm": 0.15477775037288666, "learning_rate": 0.001, "loss": 1.7862, "step": 1047400 }, { "epoch": 61.2680587237527, "grad_norm": 0.1750406175851822, "learning_rate": 0.001, "loss": 1.7927, "step": 1047500 }, { "epoch": 61.273907703105806, "grad_norm": 0.1790078580379486, "learning_rate": 0.001, "loss": 1.7864, "step": 1047600 }, { "epoch": 61.27975668245891, "grad_norm": 0.16427205502986908, "learning_rate": 0.001, "loss": 1.7861, "step": 1047700 }, { "epoch": 61.28560566181201, "grad_norm": 0.18514999747276306, "learning_rate": 0.001, "loss": 1.7828, "step": 1047800 }, { "epoch": 61.29145464116512, "grad_norm": 0.18531356751918793, "learning_rate": 0.001, "loss": 1.7774, "step": 1047900 }, { "epoch": 61.29730362051822, "grad_norm": 0.22642908990383148, "learning_rate": 0.001, "loss": 1.7879, "step": 1048000 }, { "epoch": 61.303152599871325, "grad_norm": 0.17782770097255707, "learning_rate": 0.001, "loss": 1.786, "step": 1048100 }, { "epoch": 61.30900157922443, "grad_norm": 0.24047738313674927, "learning_rate": 0.001, "loss": 1.7892, "step": 1048200 }, { "epoch": 61.314850558577525, "grad_norm": 0.18290100991725922, "learning_rate": 0.001, "loss": 1.7857, "step": 1048300 }, { "epoch": 61.32069953793063, "grad_norm": 0.18468253314495087, "learning_rate": 0.001, "loss": 1.7876, "step": 1048400 }, { "epoch": 61.32654851728373, "grad_norm": 0.16168078780174255, "learning_rate": 0.001, "loss": 1.784, "step": 1048500 }, { "epoch": 61.33239749663684, "grad_norm": 0.1595136821269989, "learning_rate": 0.001, "loss": 1.7908, "step": 1048600 }, { "epoch": 61.33824647598994, "grad_norm": 0.1498240977525711, "learning_rate": 0.001, "loss": 1.7858, "step": 1048700 }, { "epoch": 61.344095455343044, "grad_norm": 0.1841888576745987, "learning_rate": 0.001, "loss": 1.7832, "step": 1048800 }, { "epoch": 61.34994443469615, "grad_norm": 0.2057793140411377, "learning_rate": 0.001, "loss": 1.787, "step": 1048900 }, { "epoch": 61.355793414049245, "grad_norm": 0.15903723239898682, "learning_rate": 0.001, "loss": 1.7857, "step": 1049000 }, { "epoch": 61.36164239340235, "grad_norm": 0.17351578176021576, "learning_rate": 0.001, "loss": 1.7873, "step": 1049100 }, { "epoch": 61.36749137275545, "grad_norm": 0.4333707094192505, "learning_rate": 0.001, "loss": 1.7858, "step": 1049200 }, { "epoch": 61.373340352108556, "grad_norm": 0.15963880717754364, "learning_rate": 0.001, "loss": 1.7923, "step": 1049300 }, { "epoch": 61.37918933146166, "grad_norm": 0.18055792152881622, "learning_rate": 0.001, "loss": 1.7871, "step": 1049400 }, { "epoch": 61.385038310814764, "grad_norm": 0.1643994152545929, "learning_rate": 0.001, "loss": 1.785, "step": 1049500 }, { "epoch": 61.39088729016787, "grad_norm": 0.14633770287036896, "learning_rate": 0.001, "loss": 1.7856, "step": 1049600 }, { "epoch": 61.39673626952097, "grad_norm": 0.15251672267913818, "learning_rate": 0.001, "loss": 1.7884, "step": 1049700 }, { "epoch": 61.40258524887407, "grad_norm": 0.20038241147994995, "learning_rate": 0.001, "loss": 1.7844, "step": 1049800 }, { "epoch": 61.40843422822717, "grad_norm": 0.1993848979473114, "learning_rate": 0.001, "loss": 1.7831, "step": 1049900 }, { "epoch": 61.414283207580276, "grad_norm": 0.1741432547569275, "learning_rate": 0.001, "loss": 1.7885, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_ag_news_accuracy": 0.237828125, "eval_ag_news_bleu_score": 8.275094782805848, "eval_ag_news_bleu_score_sem": 0.615107465979297, "eval_ag_news_emb_cos_sim": 0.7276914119720459, "eval_ag_news_emb_cos_sim_sem": 0.01192792970687151, "eval_ag_news_emb_top1_equal": 0.9609375, "eval_ag_news_emb_top1_equal_sem": 0.017191974446177483, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.7029364109039307, "eval_ag_news_n_ngrams_match_1": 14.78125, "eval_ag_news_n_ngrams_match_2": 4.703125, "eval_ag_news_n_ngrams_match_3": 2.046875, "eval_ag_news_num_pred_words": 46.625, "eval_ag_news_num_true_words": 45.21875, "eval_ag_news_perplexity": 14.923488944504326, "eval_ag_news_pred_num_tokens": 68.6328125, "eval_ag_news_rouge_score": 0.3131729487435059, "eval_ag_news_runtime": 38.6987, "eval_ag_news_samples_per_second": 12.92, "eval_ag_news_steps_per_second": 0.026, "eval_ag_news_token_set_f1": 0.3453316577592552, "eval_ag_news_token_set_f1_sem": 0.009902347599106898, "eval_ag_news_token_set_precision": 0.3297516548371746, "eval_ag_news_token_set_recall": 0.3699053528470698, "eval_ag_news_true_num_tokens": 62.0390625, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_anthropic_toxic_prompts_accuracy": 0.104515625, "eval_anthropic_toxic_prompts_bleu_score": 46.759054517388286, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.730463438637175, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9065582752227783, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008320064283907413, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03347745561491821, "eval_anthropic_toxic_prompts_loss": 1.2741914987564087, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.515625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.15625, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.171875, "eval_anthropic_toxic_prompts_num_pred_words": 14.2421875, "eval_anthropic_toxic_prompts_num_true_words": 13.953125, "eval_anthropic_toxic_prompts_perplexity": 3.5758091948984116, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.3984375, "eval_anthropic_toxic_prompts_rouge_score": 0.7240207271498955, "eval_anthropic_toxic_prompts_runtime": 28.4604, "eval_anthropic_toxic_prompts_samples_per_second": 17.568, "eval_anthropic_toxic_prompts_steps_per_second": 0.035, "eval_anthropic_toxic_prompts_token_set_f1": 0.7375031701023305, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018037528322934527, "eval_anthropic_toxic_prompts_token_set_precision": 0.7415062706685543, "eval_anthropic_toxic_prompts_token_set_recall": 0.7415877038822504, "eval_anthropic_toxic_prompts_true_num_tokens": 17.203125, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_arxiv_accuracy": 0.382171875, "eval_arxiv_bleu_score": 1.875441631149236, "eval_arxiv_bleu_score_sem": 0.20042404774073191, "eval_arxiv_emb_cos_sim": 0.47805479168891907, "eval_arxiv_emb_cos_sim_sem": 0.018833328038454056, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4013919830322266, "eval_arxiv_n_ngrams_match_1": 13.5625, "eval_arxiv_n_ngrams_match_2": 2.6796875, "eval_arxiv_n_ngrams_match_3": 0.5546875, "eval_arxiv_num_pred_words": 53.6953125, "eval_arxiv_num_true_words": 86.5546875, "eval_arxiv_perplexity": 30.005838609184384, "eval_arxiv_pred_num_tokens": 125.421875, "eval_arxiv_rouge_score": 0.18356581022301327, "eval_arxiv_runtime": 30.6032, "eval_arxiv_samples_per_second": 16.338, "eval_arxiv_steps_per_second": 0.033, "eval_arxiv_token_set_f1": 0.18676201446402418, "eval_arxiv_token_set_f1_sem": 0.00804275635202202, "eval_arxiv_token_set_precision": 0.1260962160137554, "eval_arxiv_token_set_recall": 0.43819244817784475, "eval_arxiv_true_num_tokens": 125.609375, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_python_code_alpaca_accuracy": 0.12959375, "eval_python_code_alpaca_bleu_score": 26.624587434544424, "eval_python_code_alpaca_bleu_score_sem": 1.4251215277496159, "eval_python_code_alpaca_emb_cos_sim": 0.8597223162651062, "eval_python_code_alpaca_emb_cos_sim_sem": 0.01046306174248457, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4798551797866821, "eval_python_code_alpaca_n_ngrams_match_1": 10.921875, "eval_python_code_alpaca_n_ngrams_match_2": 5.734375, "eval_python_code_alpaca_n_ngrams_match_3": 3.2890625, "eval_python_code_alpaca_num_pred_words": 18.8125, "eval_python_code_alpaca_num_true_words": 20.1484375, "eval_python_code_alpaca_perplexity": 4.392309539652321, "eval_python_code_alpaca_pred_num_tokens": 25.53125, "eval_python_code_alpaca_rouge_score": 0.5850050401104576, "eval_python_code_alpaca_runtime": 29.1488, "eval_python_code_alpaca_samples_per_second": 17.153, "eval_python_code_alpaca_steps_per_second": 0.034, "eval_python_code_alpaca_token_set_f1": 0.6091905644170477, "eval_python_code_alpaca_token_set_f1_sem": 0.013569454840770087, "eval_python_code_alpaca_token_set_precision": 0.5918251807076979, "eval_python_code_alpaca_token_set_recall": 0.6367894926570256, "eval_python_code_alpaca_true_num_tokens": 25.7421875, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_wikibio_accuracy": 0.367875, "eval_wikibio_bleu_score": 7.6128797757649185, "eval_wikibio_bleu_score_sem": 0.6716121319124468, "eval_wikibio_emb_cos_sim": 0.6120041608810425, "eval_wikibio_emb_cos_sim_sem": 0.02376096136868, "eval_wikibio_emb_top1_equal": 0.890625, "eval_wikibio_emb_top1_equal_sem": 0.02769520878791809, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7041733264923096, "eval_wikibio_n_ngrams_match_1": 13.703125, "eval_wikibio_n_ngrams_match_2": 4.875, "eval_wikibio_n_ngrams_match_3": 2.0078125, "eval_wikibio_num_pred_words": 46.4453125, "eval_wikibio_num_true_words": 49.9296875, "eval_wikibio_perplexity": 14.941959461493049, "eval_wikibio_pred_num_tokens": 103.5859375, "eval_wikibio_rouge_score": 0.30045308439195784, "eval_wikibio_runtime": 31.2642, "eval_wikibio_samples_per_second": 15.993, "eval_wikibio_steps_per_second": 0.032, "eval_wikibio_token_set_f1": 0.32420292577005094, "eval_wikibio_token_set_f1_sem": 0.01284811269713319, "eval_wikibio_token_set_precision": 0.29266393171901556, "eval_wikibio_token_set_recall": 0.40404466567015124, "eval_wikibio_true_num_tokens": 96.75, "step": 1050000 }, { "epoch": 61.414283207580276, "eval_msmarco_accuracy": 0.39146875, "eval_msmarco_bleu_score": 21.45533297567586, "eval_msmarco_bleu_score_sem": 1.7478170317401724, "eval_msmarco_emb_cos_sim": 0.8155355453491211, "eval_msmarco_emb_cos_sim_sem": 0.015170806087553501, "eval_msmarco_emb_top1_equal": 0.9375, "eval_msmarco_emb_top1_equal_sem": 0.02147948183119297, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7289601564407349, "eval_msmarco_n_ngrams_match_1": 30.0546875, "eval_msmarco_n_ngrams_match_2": 15.3984375, "eval_msmarco_n_ngrams_match_3": 9.453125, "eval_msmarco_num_pred_words": 59.3671875, "eval_msmarco_num_true_words": 59.875, "eval_msmarco_perplexity": 5.634791559282832, "eval_msmarco_pred_num_tokens": 82.2890625, "eval_msmarco_rouge_score": 0.4809721685306096, "eval_msmarco_runtime": 26.3031, "eval_msmarco_samples_per_second": 19.009, "eval_msmarco_steps_per_second": 0.038, "eval_msmarco_token_set_f1": 0.5102594462075405, "eval_msmarco_token_set_f1_sem": 0.015330802755235606, "eval_msmarco_token_set_precision": 0.4761940672608085, "eval_msmarco_token_set_recall": 0.5724650073903985, "eval_msmarco_true_num_tokens": 78.125, "step": 1050000 }, { "epoch": 61.42013218693338, "grad_norm": 0.1831505000591278, "learning_rate": 0.001, "loss": 1.7858, "step": 1050100 }, { "epoch": 61.42598116628648, "grad_norm": 0.1722240298986435, "learning_rate": 0.001, "loss": 1.787, "step": 1050200 }, { "epoch": 61.43183014563959, "grad_norm": 0.15150414407253265, "learning_rate": 0.001, "loss": 1.7876, "step": 1050300 }, { "epoch": 61.43767912499269, "grad_norm": 0.15989267826080322, "learning_rate": 0.001, "loss": 1.7878, "step": 1050400 }, { "epoch": 61.443528104345795, "grad_norm": 0.15766195952892303, "learning_rate": 0.001, "loss": 1.7899, "step": 1050500 }, { "epoch": 61.44937708369889, "grad_norm": 0.2013266384601593, "learning_rate": 0.001, "loss": 1.7897, "step": 1050600 }, { "epoch": 61.455226063051995, "grad_norm": 0.189104825258255, "learning_rate": 0.001, "loss": 1.7815, "step": 1050700 }, { "epoch": 61.4610750424051, "grad_norm": 0.20875439047813416, "learning_rate": 0.001, "loss": 1.7919, "step": 1050800 }, { "epoch": 61.4669240217582, "grad_norm": 0.14867857098579407, "learning_rate": 0.001, "loss": 1.7932, "step": 1050900 }, { "epoch": 61.47277300111131, "grad_norm": 0.21397747099399567, "learning_rate": 0.001, "loss": 1.7821, "step": 1051000 }, { "epoch": 61.47862198046441, "grad_norm": 0.17741158604621887, "learning_rate": 0.001, "loss": 1.7837, "step": 1051100 }, { "epoch": 61.484470959817514, "grad_norm": 0.23193655908107758, "learning_rate": 0.001, "loss": 1.792, "step": 1051200 }, { "epoch": 61.49031993917062, "grad_norm": 0.17184923589229584, "learning_rate": 0.001, "loss": 1.7868, "step": 1051300 }, { "epoch": 61.496168918523715, "grad_norm": 0.149400994181633, "learning_rate": 0.001, "loss": 1.7926, "step": 1051400 }, { "epoch": 61.50201789787682, "grad_norm": 0.15418361127376556, "learning_rate": 0.001, "loss": 1.7951, "step": 1051500 }, { "epoch": 61.50786687722992, "grad_norm": 0.16260792315006256, "learning_rate": 0.001, "loss": 1.7935, "step": 1051600 }, { "epoch": 61.513715856583026, "grad_norm": 0.17752037942409515, "learning_rate": 0.001, "loss": 1.788, "step": 1051700 }, { "epoch": 61.51956483593613, "grad_norm": 0.15206705033779144, "learning_rate": 0.001, "loss": 1.791, "step": 1051800 }, { "epoch": 61.525413815289234, "grad_norm": 0.1765832155942917, "learning_rate": 0.001, "loss": 1.7895, "step": 1051900 }, { "epoch": 61.53126279464234, "grad_norm": 0.204890638589859, "learning_rate": 0.001, "loss": 1.7806, "step": 1052000 }, { "epoch": 61.537111773995434, "grad_norm": 0.17380449175834656, "learning_rate": 0.001, "loss": 1.7938, "step": 1052100 }, { "epoch": 61.54296075334854, "grad_norm": 0.21444253623485565, "learning_rate": 0.001, "loss": 1.7858, "step": 1052200 }, { "epoch": 61.54880973270164, "grad_norm": 0.2615300714969635, "learning_rate": 0.001, "loss": 1.7954, "step": 1052300 }, { "epoch": 61.554658712054746, "grad_norm": 0.17310190200805664, "learning_rate": 0.001, "loss": 1.7865, "step": 1052400 }, { "epoch": 61.56050769140785, "grad_norm": 0.16452307999134064, "learning_rate": 0.001, "loss": 1.7868, "step": 1052500 }, { "epoch": 61.56635667076095, "grad_norm": 0.21974365413188934, "learning_rate": 0.001, "loss": 1.7874, "step": 1052600 }, { "epoch": 61.57220565011406, "grad_norm": 0.170737162232399, "learning_rate": 0.001, "loss": 1.7941, "step": 1052700 }, { "epoch": 61.57805462946716, "grad_norm": 0.16132193803787231, "learning_rate": 0.001, "loss": 1.7833, "step": 1052800 }, { "epoch": 61.58390360882026, "grad_norm": 0.16511671245098114, "learning_rate": 0.001, "loss": 1.7885, "step": 1052900 }, { "epoch": 61.58975258817336, "grad_norm": 0.2512401342391968, "learning_rate": 0.001, "loss": 1.7914, "step": 1053000 }, { "epoch": 61.595601567526465, "grad_norm": 0.16852234303951263, "learning_rate": 0.001, "loss": 1.7865, "step": 1053100 }, { "epoch": 61.60145054687957, "grad_norm": 0.18688485026359558, "learning_rate": 0.001, "loss": 1.7872, "step": 1053200 }, { "epoch": 61.60729952623267, "grad_norm": 0.19146926701068878, "learning_rate": 0.001, "loss": 1.794, "step": 1053300 }, { "epoch": 61.61314850558578, "grad_norm": 0.17747513949871063, "learning_rate": 0.001, "loss": 1.7899, "step": 1053400 }, { "epoch": 61.61899748493888, "grad_norm": 0.24220189452171326, "learning_rate": 0.001, "loss": 1.7903, "step": 1053500 }, { "epoch": 61.624846464291984, "grad_norm": 0.18335603177547455, "learning_rate": 0.001, "loss": 1.7873, "step": 1053600 }, { "epoch": 61.63069544364508, "grad_norm": 0.19785304367542267, "learning_rate": 0.001, "loss": 1.7865, "step": 1053700 }, { "epoch": 61.636544422998185, "grad_norm": 0.16035926342010498, "learning_rate": 0.001, "loss": 1.7967, "step": 1053800 }, { "epoch": 61.64239340235129, "grad_norm": 0.14794278144836426, "learning_rate": 0.001, "loss": 1.7901, "step": 1053900 }, { "epoch": 61.64824238170439, "grad_norm": 0.1858944296836853, "learning_rate": 0.001, "loss": 1.7917, "step": 1054000 }, { "epoch": 61.654091361057496, "grad_norm": 0.19895239174365997, "learning_rate": 0.001, "loss": 1.7989, "step": 1054100 }, { "epoch": 61.6599403404106, "grad_norm": 0.17199528217315674, "learning_rate": 0.001, "loss": 1.7916, "step": 1054200 }, { "epoch": 61.665789319763704, "grad_norm": 0.14841638505458832, "learning_rate": 0.001, "loss": 1.7879, "step": 1054300 }, { "epoch": 61.67163829911681, "grad_norm": 0.1749040186405182, "learning_rate": 0.001, "loss": 1.7902, "step": 1054400 }, { "epoch": 61.677487278469904, "grad_norm": 0.20408619940280914, "learning_rate": 0.001, "loss": 1.7916, "step": 1054500 }, { "epoch": 61.68333625782301, "grad_norm": 0.16643279790878296, "learning_rate": 0.001, "loss": 1.7922, "step": 1054600 }, { "epoch": 61.68918523717611, "grad_norm": 0.16357383131980896, "learning_rate": 0.001, "loss": 1.7899, "step": 1054700 }, { "epoch": 61.695034216529216, "grad_norm": 0.1589398831129074, "learning_rate": 0.001, "loss": 1.7904, "step": 1054800 }, { "epoch": 61.70088319588232, "grad_norm": 0.20228509604930878, "learning_rate": 0.001, "loss": 1.7861, "step": 1054900 }, { "epoch": 61.70673217523542, "grad_norm": 0.22109688818454742, "learning_rate": 0.001, "loss": 1.7886, "step": 1055000 }, { "epoch": 61.71258115458853, "grad_norm": 0.24945254623889923, "learning_rate": 0.001, "loss": 1.7965, "step": 1055100 }, { "epoch": 61.718430133941624, "grad_norm": 0.18864372372627258, "learning_rate": 0.001, "loss": 1.789, "step": 1055200 }, { "epoch": 61.72427911329473, "grad_norm": 0.19610227644443512, "learning_rate": 0.001, "loss": 1.7971, "step": 1055300 }, { "epoch": 61.73012809264783, "grad_norm": 0.18000538647174835, "learning_rate": 0.001, "loss": 1.7936, "step": 1055400 }, { "epoch": 61.735977072000935, "grad_norm": 0.17532935738563538, "learning_rate": 0.001, "loss": 1.7888, "step": 1055500 }, { "epoch": 61.74182605135404, "grad_norm": 0.18703337013721466, "learning_rate": 0.001, "loss": 1.7856, "step": 1055600 }, { "epoch": 61.74767503070714, "grad_norm": 0.16637814044952393, "learning_rate": 0.001, "loss": 1.7887, "step": 1055700 }, { "epoch": 61.75352401006025, "grad_norm": 0.1615189015865326, "learning_rate": 0.001, "loss": 1.7852, "step": 1055800 }, { "epoch": 61.75937298941335, "grad_norm": 0.16538193821907043, "learning_rate": 0.001, "loss": 1.7852, "step": 1055900 }, { "epoch": 61.76522196876645, "grad_norm": 0.1732153445482254, "learning_rate": 0.001, "loss": 1.796, "step": 1056000 }, { "epoch": 61.77107094811955, "grad_norm": 0.18862619996070862, "learning_rate": 0.001, "loss": 1.7901, "step": 1056100 }, { "epoch": 61.776919927472655, "grad_norm": 0.20275136828422546, "learning_rate": 0.001, "loss": 1.7889, "step": 1056200 }, { "epoch": 61.78276890682576, "grad_norm": 0.18113096058368683, "learning_rate": 0.001, "loss": 1.7981, "step": 1056300 }, { "epoch": 61.78861788617886, "grad_norm": 0.16857191920280457, "learning_rate": 0.001, "loss": 1.7964, "step": 1056400 }, { "epoch": 61.794466865531966, "grad_norm": 0.1563461273908615, "learning_rate": 0.001, "loss": 1.7922, "step": 1056500 }, { "epoch": 61.80031584488507, "grad_norm": 0.20138248801231384, "learning_rate": 0.001, "loss": 1.7925, "step": 1056600 }, { "epoch": 61.806164824238174, "grad_norm": 0.1730760782957077, "learning_rate": 0.001, "loss": 1.7942, "step": 1056700 }, { "epoch": 61.81201380359127, "grad_norm": 0.25843557715415955, "learning_rate": 0.001, "loss": 1.7926, "step": 1056800 }, { "epoch": 61.817862782944374, "grad_norm": 0.17796343564987183, "learning_rate": 0.001, "loss": 1.7922, "step": 1056900 }, { "epoch": 61.82371176229748, "grad_norm": 0.1973131000995636, "learning_rate": 0.001, "loss": 1.7885, "step": 1057000 }, { "epoch": 61.82956074165058, "grad_norm": 0.3418956696987152, "learning_rate": 0.001, "loss": 1.7913, "step": 1057100 }, { "epoch": 61.835409721003685, "grad_norm": 0.16824866831302643, "learning_rate": 0.001, "loss": 1.7917, "step": 1057200 }, { "epoch": 61.84125870035679, "grad_norm": 0.1635952889919281, "learning_rate": 0.001, "loss": 1.7903, "step": 1057300 }, { "epoch": 61.84710767970989, "grad_norm": 0.15138275921344757, "learning_rate": 0.001, "loss": 1.7902, "step": 1057400 }, { "epoch": 61.852956659063, "grad_norm": 0.18089857697486877, "learning_rate": 0.001, "loss": 1.7904, "step": 1057500 }, { "epoch": 61.858805638416094, "grad_norm": 0.1928587108850479, "learning_rate": 0.001, "loss": 1.792, "step": 1057600 }, { "epoch": 61.8646546177692, "grad_norm": 0.2106967270374298, "learning_rate": 0.001, "loss": 1.7896, "step": 1057700 }, { "epoch": 61.8705035971223, "grad_norm": 0.2679547071456909, "learning_rate": 0.001, "loss": 1.791, "step": 1057800 }, { "epoch": 61.876352576475405, "grad_norm": 0.16180989146232605, "learning_rate": 0.001, "loss": 1.7888, "step": 1057900 }, { "epoch": 61.88220155582851, "grad_norm": 0.16328230500221252, "learning_rate": 0.001, "loss": 1.7865, "step": 1058000 }, { "epoch": 61.88805053518161, "grad_norm": 0.16092057526111603, "learning_rate": 0.001, "loss": 1.7937, "step": 1058100 }, { "epoch": 61.893899514534716, "grad_norm": 0.25329333543777466, "learning_rate": 0.001, "loss": 1.7942, "step": 1058200 }, { "epoch": 61.89974849388781, "grad_norm": 0.15376345813274384, "learning_rate": 0.001, "loss": 1.7926, "step": 1058300 }, { "epoch": 61.90559747324092, "grad_norm": 0.1920117884874344, "learning_rate": 0.001, "loss": 1.7965, "step": 1058400 }, { "epoch": 61.91144645259402, "grad_norm": 0.17509102821350098, "learning_rate": 0.001, "loss": 1.7824, "step": 1058500 }, { "epoch": 61.917295431947124, "grad_norm": 0.1830456405878067, "learning_rate": 0.001, "loss": 1.7859, "step": 1058600 }, { "epoch": 61.92314441130023, "grad_norm": 0.20337702333927155, "learning_rate": 0.001, "loss": 1.7909, "step": 1058700 }, { "epoch": 61.92899339065333, "grad_norm": 0.15559719502925873, "learning_rate": 0.001, "loss": 1.7862, "step": 1058800 }, { "epoch": 61.934842370006436, "grad_norm": 0.15577416121959686, "learning_rate": 0.001, "loss": 1.7875, "step": 1058900 }, { "epoch": 61.94069134935954, "grad_norm": 0.17377734184265137, "learning_rate": 0.001, "loss": 1.7887, "step": 1059000 }, { "epoch": 61.946540328712636, "grad_norm": 0.17599424719810486, "learning_rate": 0.001, "loss": 1.7878, "step": 1059100 }, { "epoch": 61.95238930806574, "grad_norm": 0.14227819442749023, "learning_rate": 0.001, "loss": 1.7869, "step": 1059200 }, { "epoch": 61.958238287418844, "grad_norm": 0.16979296505451202, "learning_rate": 0.001, "loss": 1.7965, "step": 1059300 }, { "epoch": 61.96408726677195, "grad_norm": 0.23160773515701294, "learning_rate": 0.001, "loss": 1.7916, "step": 1059400 }, { "epoch": 61.96993624612505, "grad_norm": 0.17531585693359375, "learning_rate": 0.001, "loss": 1.7922, "step": 1059500 }, { "epoch": 61.975785225478155, "grad_norm": 0.19284136593341827, "learning_rate": 0.001, "loss": 1.7939, "step": 1059600 }, { "epoch": 61.98163420483126, "grad_norm": 0.16241143643856049, "learning_rate": 0.001, "loss": 1.7872, "step": 1059700 }, { "epoch": 61.98748318418436, "grad_norm": 0.1580924391746521, "learning_rate": 0.001, "loss": 1.7934, "step": 1059800 }, { "epoch": 61.99333216353746, "grad_norm": 0.16429150104522705, "learning_rate": 0.001, "loss": 1.7927, "step": 1059900 }, { "epoch": 61.99918114289056, "grad_norm": 0.18666981160640717, "learning_rate": 0.001, "loss": 1.7947, "step": 1060000 }, { "epoch": 62.00503012224367, "grad_norm": 0.1719045788049698, "learning_rate": 0.001, "loss": 1.779, "step": 1060100 }, { "epoch": 62.01087910159677, "grad_norm": 0.21749915182590485, "learning_rate": 0.001, "loss": 1.7729, "step": 1060200 }, { "epoch": 62.016728080949875, "grad_norm": 0.23026399314403534, "learning_rate": 0.001, "loss": 1.7831, "step": 1060300 }, { "epoch": 62.02257706030298, "grad_norm": 0.2183799296617508, "learning_rate": 0.001, "loss": 1.7798, "step": 1060400 }, { "epoch": 62.02842603965608, "grad_norm": 0.19185714423656464, "learning_rate": 0.001, "loss": 1.78, "step": 1060500 }, { "epoch": 62.034275019009186, "grad_norm": 0.22583137452602386, "learning_rate": 0.001, "loss": 1.7827, "step": 1060600 }, { "epoch": 62.04012399836228, "grad_norm": 0.22861959040164948, "learning_rate": 0.001, "loss": 1.7813, "step": 1060700 }, { "epoch": 62.04597297771539, "grad_norm": 0.18779587745666504, "learning_rate": 0.001, "loss": 1.7821, "step": 1060800 }, { "epoch": 62.05182195706849, "grad_norm": 0.3117152154445648, "learning_rate": 0.001, "loss": 1.7889, "step": 1060900 }, { "epoch": 62.057670936421594, "grad_norm": 0.2826651930809021, "learning_rate": 0.001, "loss": 1.7801, "step": 1061000 }, { "epoch": 62.0635199157747, "grad_norm": 0.2156379520893097, "learning_rate": 0.001, "loss": 1.7835, "step": 1061100 }, { "epoch": 62.0693688951278, "grad_norm": 0.24757851660251617, "learning_rate": 0.001, "loss": 1.781, "step": 1061200 }, { "epoch": 62.075217874480906, "grad_norm": 0.17734739184379578, "learning_rate": 0.001, "loss": 1.7814, "step": 1061300 }, { "epoch": 62.081066853834, "grad_norm": 0.22264617681503296, "learning_rate": 0.001, "loss": 1.7791, "step": 1061400 }, { "epoch": 62.086915833187106, "grad_norm": 0.21444550156593323, "learning_rate": 0.001, "loss": 1.7823, "step": 1061500 }, { "epoch": 62.09276481254021, "grad_norm": 0.1541289985179901, "learning_rate": 0.001, "loss": 1.7855, "step": 1061600 }, { "epoch": 62.098613791893314, "grad_norm": 0.20319953560829163, "learning_rate": 0.001, "loss": 1.7832, "step": 1061700 }, { "epoch": 62.10446277124642, "grad_norm": 0.21605263650417328, "learning_rate": 0.001, "loss": 1.787, "step": 1061800 }, { "epoch": 62.11031175059952, "grad_norm": 0.1666560024023056, "learning_rate": 0.001, "loss": 1.7814, "step": 1061900 }, { "epoch": 62.116160729952625, "grad_norm": 0.20101569592952728, "learning_rate": 0.001, "loss": 1.7746, "step": 1062000 }, { "epoch": 62.12200970930573, "grad_norm": 0.18712687492370605, "learning_rate": 0.001, "loss": 1.7845, "step": 1062100 }, { "epoch": 62.127858688658826, "grad_norm": 0.17261196672916412, "learning_rate": 0.001, "loss": 1.7869, "step": 1062200 }, { "epoch": 62.13370766801193, "grad_norm": 0.17861846089363098, "learning_rate": 0.001, "loss": 1.7815, "step": 1062300 }, { "epoch": 62.13955664736503, "grad_norm": 0.20630104839801788, "learning_rate": 0.001, "loss": 1.7792, "step": 1062400 }, { "epoch": 62.14540562671814, "grad_norm": 0.17240604758262634, "learning_rate": 0.001, "loss": 1.7798, "step": 1062500 }, { "epoch": 62.15125460607124, "grad_norm": 0.20401960611343384, "learning_rate": 0.001, "loss": 1.7821, "step": 1062600 }, { "epoch": 62.157103585424345, "grad_norm": 0.20938892662525177, "learning_rate": 0.001, "loss": 1.7808, "step": 1062700 }, { "epoch": 62.16295256477745, "grad_norm": 0.2444562315940857, "learning_rate": 0.001, "loss": 1.7778, "step": 1062800 }, { "epoch": 62.16880154413055, "grad_norm": 0.2374412566423416, "learning_rate": 0.001, "loss": 1.7769, "step": 1062900 }, { "epoch": 62.17465052348365, "grad_norm": 0.18706412613391876, "learning_rate": 0.001, "loss": 1.7793, "step": 1063000 }, { "epoch": 62.18049950283675, "grad_norm": 0.19779275357723236, "learning_rate": 0.001, "loss": 1.7829, "step": 1063100 }, { "epoch": 62.18634848218986, "grad_norm": 0.34205660223960876, "learning_rate": 0.001, "loss": 1.7778, "step": 1063200 }, { "epoch": 62.19219746154296, "grad_norm": 0.27463456988334656, "learning_rate": 0.001, "loss": 1.7806, "step": 1063300 }, { "epoch": 62.198046440896064, "grad_norm": 0.18531204760074615, "learning_rate": 0.001, "loss": 1.7829, "step": 1063400 }, { "epoch": 62.20389542024917, "grad_norm": 0.21067793667316437, "learning_rate": 0.001, "loss": 1.7844, "step": 1063500 }, { "epoch": 62.20974439960227, "grad_norm": 0.19103099405765533, "learning_rate": 0.001, "loss": 1.7769, "step": 1063600 }, { "epoch": 62.215593378955376, "grad_norm": 0.22127704322338104, "learning_rate": 0.001, "loss": 1.7857, "step": 1063700 }, { "epoch": 62.22144235830847, "grad_norm": 0.19143430888652802, "learning_rate": 0.001, "loss": 1.7857, "step": 1063800 }, { "epoch": 62.227291337661576, "grad_norm": 0.16568224132061005, "learning_rate": 0.001, "loss": 1.782, "step": 1063900 }, { "epoch": 62.23314031701468, "grad_norm": 0.17801041901111603, "learning_rate": 0.001, "loss": 1.7804, "step": 1064000 }, { "epoch": 62.238989296367784, "grad_norm": 0.21664990484714508, "learning_rate": 0.001, "loss": 1.7862, "step": 1064100 }, { "epoch": 62.24483827572089, "grad_norm": 0.22218991816043854, "learning_rate": 0.001, "loss": 1.781, "step": 1064200 }, { "epoch": 62.25068725507399, "grad_norm": 0.17854627966880798, "learning_rate": 0.001, "loss": 1.7895, "step": 1064300 }, { "epoch": 62.256536234427095, "grad_norm": 0.1684245616197586, "learning_rate": 0.001, "loss": 1.781, "step": 1064400 }, { "epoch": 62.26238521378019, "grad_norm": 0.20941367745399475, "learning_rate": 0.001, "loss": 1.7842, "step": 1064500 }, { "epoch": 62.268234193133296, "grad_norm": 0.15839038789272308, "learning_rate": 0.001, "loss": 1.7852, "step": 1064600 }, { "epoch": 62.2740831724864, "grad_norm": 0.19726014137268066, "learning_rate": 0.001, "loss": 1.7847, "step": 1064700 }, { "epoch": 62.2799321518395, "grad_norm": 0.15677863359451294, "learning_rate": 0.001, "loss": 1.7826, "step": 1064800 }, { "epoch": 62.28578113119261, "grad_norm": 0.21386538445949554, "learning_rate": 0.001, "loss": 1.7834, "step": 1064900 }, { "epoch": 62.29163011054571, "grad_norm": 0.23674900829792023, "learning_rate": 0.001, "loss": 1.7862, "step": 1065000 }, { "epoch": 62.297479089898815, "grad_norm": 0.1861664056777954, "learning_rate": 0.001, "loss": 1.7859, "step": 1065100 }, { "epoch": 62.30332806925192, "grad_norm": 0.19562223553657532, "learning_rate": 0.001, "loss": 1.7837, "step": 1065200 }, { "epoch": 62.309177048605015, "grad_norm": 0.16591133177280426, "learning_rate": 0.001, "loss": 1.7809, "step": 1065300 }, { "epoch": 62.31502602795812, "grad_norm": 0.16914908587932587, "learning_rate": 0.001, "loss": 1.7866, "step": 1065400 }, { "epoch": 62.32087500731122, "grad_norm": 0.18975555896759033, "learning_rate": 0.001, "loss": 1.7893, "step": 1065500 }, { "epoch": 62.32672398666433, "grad_norm": 0.18538245558738708, "learning_rate": 0.001, "loss": 1.7866, "step": 1065600 }, { "epoch": 62.33257296601743, "grad_norm": 0.19540226459503174, "learning_rate": 0.001, "loss": 1.7894, "step": 1065700 }, { "epoch": 62.338421945370534, "grad_norm": 0.1764564961194992, "learning_rate": 0.001, "loss": 1.7839, "step": 1065800 }, { "epoch": 62.34427092472364, "grad_norm": 0.17543156445026398, "learning_rate": 0.001, "loss": 1.7857, "step": 1065900 }, { "epoch": 62.35011990407674, "grad_norm": 0.23686841130256653, "learning_rate": 0.001, "loss": 1.7839, "step": 1066000 }, { "epoch": 62.35596888342984, "grad_norm": 0.2596539556980133, "learning_rate": 0.001, "loss": 1.7849, "step": 1066100 }, { "epoch": 62.36181786278294, "grad_norm": 0.2457362413406372, "learning_rate": 0.001, "loss": 1.7916, "step": 1066200 }, { "epoch": 62.367666842136046, "grad_norm": 0.20114417374134064, "learning_rate": 0.001, "loss": 1.7852, "step": 1066300 }, { "epoch": 62.37351582148915, "grad_norm": 0.18268299102783203, "learning_rate": 0.001, "loss": 1.7849, "step": 1066400 }, { "epoch": 62.379364800842254, "grad_norm": 0.17880168557167053, "learning_rate": 0.001, "loss": 1.7853, "step": 1066500 }, { "epoch": 62.38521378019536, "grad_norm": 0.18505094945430756, "learning_rate": 0.001, "loss": 1.7859, "step": 1066600 }, { "epoch": 62.39106275954846, "grad_norm": 0.19995014369487762, "learning_rate": 0.001, "loss": 1.7797, "step": 1066700 }, { "epoch": 62.396911738901565, "grad_norm": 0.17162573337554932, "learning_rate": 0.001, "loss": 1.7854, "step": 1066800 }, { "epoch": 62.40276071825466, "grad_norm": 0.19219155609607697, "learning_rate": 0.001, "loss": 1.7823, "step": 1066900 }, { "epoch": 62.408609697607766, "grad_norm": 0.18437671661376953, "learning_rate": 0.001, "loss": 1.7822, "step": 1067000 }, { "epoch": 62.41445867696087, "grad_norm": 0.2246115654706955, "learning_rate": 0.001, "loss": 1.7822, "step": 1067100 }, { "epoch": 62.42030765631397, "grad_norm": 0.21140921115875244, "learning_rate": 0.001, "loss": 1.7858, "step": 1067200 }, { "epoch": 62.42615663566708, "grad_norm": 0.1684417724609375, "learning_rate": 0.001, "loss": 1.792, "step": 1067300 }, { "epoch": 62.43200561502018, "grad_norm": 0.23196928203105927, "learning_rate": 0.001, "loss": 1.782, "step": 1067400 }, { "epoch": 62.437854594373285, "grad_norm": 0.17843414843082428, "learning_rate": 0.001, "loss": 1.7894, "step": 1067500 }, { "epoch": 62.44370357372638, "grad_norm": 0.1539241224527359, "learning_rate": 0.001, "loss": 1.7894, "step": 1067600 }, { "epoch": 62.449552553079485, "grad_norm": 0.1978389471769333, "learning_rate": 0.001, "loss": 1.7854, "step": 1067700 }, { "epoch": 62.45540153243259, "grad_norm": 0.20733866095542908, "learning_rate": 0.001, "loss": 1.7838, "step": 1067800 }, { "epoch": 62.46125051178569, "grad_norm": 0.16286703944206238, "learning_rate": 0.001, "loss": 1.7882, "step": 1067900 }, { "epoch": 62.4670994911388, "grad_norm": 0.25245237350463867, "learning_rate": 0.001, "loss": 1.786, "step": 1068000 }, { "epoch": 62.4729484704919, "grad_norm": 0.1762998402118683, "learning_rate": 0.001, "loss": 1.7875, "step": 1068100 }, { "epoch": 62.478797449845004, "grad_norm": 0.2579391598701477, "learning_rate": 0.001, "loss": 1.7873, "step": 1068200 }, { "epoch": 62.48464642919811, "grad_norm": 0.23730824887752533, "learning_rate": 0.001, "loss": 1.7862, "step": 1068300 }, { "epoch": 62.490495408551205, "grad_norm": 0.1701381355524063, "learning_rate": 0.001, "loss": 1.7899, "step": 1068400 }, { "epoch": 62.49634438790431, "grad_norm": 0.20213064551353455, "learning_rate": 0.001, "loss": 1.7947, "step": 1068500 }, { "epoch": 62.50219336725741, "grad_norm": 0.20831800997257233, "learning_rate": 0.001, "loss": 1.7905, "step": 1068600 }, { "epoch": 62.508042346610516, "grad_norm": 0.18143031001091003, "learning_rate": 0.001, "loss": 1.7878, "step": 1068700 }, { "epoch": 62.51389132596362, "grad_norm": 0.23916034400463104, "learning_rate": 0.001, "loss": 1.7849, "step": 1068800 }, { "epoch": 62.51974030531672, "grad_norm": 0.229106605052948, "learning_rate": 0.001, "loss": 1.7884, "step": 1068900 }, { "epoch": 62.52558928466983, "grad_norm": 0.15813949704170227, "learning_rate": 0.001, "loss": 1.7886, "step": 1069000 }, { "epoch": 62.53143826402293, "grad_norm": 0.19100475311279297, "learning_rate": 0.001, "loss": 1.7856, "step": 1069100 }, { "epoch": 62.53728724337603, "grad_norm": 0.18782617151737213, "learning_rate": 0.001, "loss": 1.778, "step": 1069200 }, { "epoch": 62.54313622272913, "grad_norm": 0.20714999735355377, "learning_rate": 0.001, "loss": 1.7884, "step": 1069300 }, { "epoch": 62.548985202082235, "grad_norm": 0.19228117167949677, "learning_rate": 0.001, "loss": 1.7853, "step": 1069400 }, { "epoch": 62.55483418143534, "grad_norm": 0.17576108872890472, "learning_rate": 0.001, "loss": 1.7876, "step": 1069500 }, { "epoch": 62.56068316078844, "grad_norm": 0.16070155799388885, "learning_rate": 0.001, "loss": 1.7881, "step": 1069600 }, { "epoch": 62.56653214014155, "grad_norm": 0.2234620898962021, "learning_rate": 0.001, "loss": 1.7869, "step": 1069700 }, { "epoch": 62.57238111949465, "grad_norm": 0.16548343002796173, "learning_rate": 0.001, "loss": 1.7876, "step": 1069800 }, { "epoch": 62.578230098847754, "grad_norm": 0.18662409484386444, "learning_rate": 0.001, "loss": 1.7884, "step": 1069900 }, { "epoch": 62.58407907820085, "grad_norm": 0.2830894887447357, "learning_rate": 0.001, "loss": 1.7853, "step": 1070000 }, { "epoch": 62.589928057553955, "grad_norm": 0.18761643767356873, "learning_rate": 0.001, "loss": 1.784, "step": 1070100 }, { "epoch": 62.59577703690706, "grad_norm": 0.20276466012001038, "learning_rate": 0.001, "loss": 1.7898, "step": 1070200 }, { "epoch": 62.60162601626016, "grad_norm": 0.15851080417633057, "learning_rate": 0.001, "loss": 1.7856, "step": 1070300 }, { "epoch": 62.607474995613266, "grad_norm": 0.17673277854919434, "learning_rate": 0.001, "loss": 1.7955, "step": 1070400 }, { "epoch": 62.61332397496637, "grad_norm": 0.21649539470672607, "learning_rate": 0.001, "loss": 1.7794, "step": 1070500 }, { "epoch": 62.619172954319474, "grad_norm": 0.19002032279968262, "learning_rate": 0.001, "loss": 1.7809, "step": 1070600 }, { "epoch": 62.62502193367257, "grad_norm": 0.24950480461120605, "learning_rate": 0.001, "loss": 1.7864, "step": 1070700 }, { "epoch": 62.630870913025674, "grad_norm": 0.19906073808670044, "learning_rate": 0.001, "loss": 1.7891, "step": 1070800 }, { "epoch": 62.63671989237878, "grad_norm": 0.19245857000350952, "learning_rate": 0.001, "loss": 1.7855, "step": 1070900 }, { "epoch": 62.64256887173188, "grad_norm": 0.18944643437862396, "learning_rate": 0.001, "loss": 1.786, "step": 1071000 }, { "epoch": 62.648417851084986, "grad_norm": 0.19934572279453278, "learning_rate": 0.001, "loss": 1.7853, "step": 1071100 }, { "epoch": 62.65426683043809, "grad_norm": 0.19188904762268066, "learning_rate": 0.001, "loss": 1.7824, "step": 1071200 }, { "epoch": 62.66011580979119, "grad_norm": 0.172365203499794, "learning_rate": 0.001, "loss": 1.7858, "step": 1071300 }, { "epoch": 62.6659647891443, "grad_norm": 0.18273283541202545, "learning_rate": 0.001, "loss": 1.7909, "step": 1071400 }, { "epoch": 62.671813768497394, "grad_norm": 0.19171948730945587, "learning_rate": 0.001, "loss": 1.7884, "step": 1071500 }, { "epoch": 62.6776627478505, "grad_norm": 0.20073537528514862, "learning_rate": 0.001, "loss": 1.789, "step": 1071600 }, { "epoch": 62.6835117272036, "grad_norm": 0.2583363652229309, "learning_rate": 0.001, "loss": 1.7908, "step": 1071700 }, { "epoch": 62.689360706556705, "grad_norm": 0.18296009302139282, "learning_rate": 0.001, "loss": 1.7919, "step": 1071800 }, { "epoch": 62.69520968590981, "grad_norm": 0.19336465001106262, "learning_rate": 0.001, "loss": 1.7852, "step": 1071900 }, { "epoch": 62.70105866526291, "grad_norm": 0.1854369044303894, "learning_rate": 0.001, "loss": 1.7987, "step": 1072000 }, { "epoch": 62.70690764461602, "grad_norm": 0.16693556308746338, "learning_rate": 0.001, "loss": 1.7932, "step": 1072100 }, { "epoch": 62.71275662396912, "grad_norm": 0.18951214849948883, "learning_rate": 0.001, "loss": 1.7871, "step": 1072200 }, { "epoch": 62.71860560332222, "grad_norm": 0.26535966992378235, "learning_rate": 0.001, "loss": 1.7865, "step": 1072300 }, { "epoch": 62.72445458267532, "grad_norm": 0.2032579779624939, "learning_rate": 0.001, "loss": 1.7927, "step": 1072400 }, { "epoch": 62.730303562028425, "grad_norm": 0.18657103180885315, "learning_rate": 0.001, "loss": 1.7868, "step": 1072500 }, { "epoch": 62.73615254138153, "grad_norm": 0.23923975229263306, "learning_rate": 0.001, "loss": 1.7879, "step": 1072600 }, { "epoch": 62.74200152073463, "grad_norm": 0.20687149465084076, "learning_rate": 0.001, "loss": 1.7907, "step": 1072700 }, { "epoch": 62.747850500087736, "grad_norm": 0.18671005964279175, "learning_rate": 0.001, "loss": 1.7848, "step": 1072800 }, { "epoch": 62.75369947944084, "grad_norm": 0.19455866515636444, "learning_rate": 0.001, "loss": 1.7885, "step": 1072900 }, { "epoch": 62.759548458793944, "grad_norm": 0.24705442786216736, "learning_rate": 0.001, "loss": 1.7917, "step": 1073000 }, { "epoch": 62.76539743814704, "grad_norm": 0.1691669374704361, "learning_rate": 0.001, "loss": 1.7845, "step": 1073100 }, { "epoch": 62.771246417500144, "grad_norm": 0.19370077550411224, "learning_rate": 0.001, "loss": 1.7815, "step": 1073200 }, { "epoch": 62.77709539685325, "grad_norm": 0.1832215040922165, "learning_rate": 0.001, "loss": 1.7865, "step": 1073300 }, { "epoch": 62.78294437620635, "grad_norm": 0.17467208206653595, "learning_rate": 0.001, "loss": 1.7901, "step": 1073400 }, { "epoch": 62.788793355559456, "grad_norm": 0.19815242290496826, "learning_rate": 0.001, "loss": 1.7854, "step": 1073500 }, { "epoch": 62.79464233491256, "grad_norm": 0.18078657984733582, "learning_rate": 0.001, "loss": 1.7878, "step": 1073600 }, { "epoch": 62.80049131426566, "grad_norm": 0.19644124805927277, "learning_rate": 0.001, "loss": 1.7853, "step": 1073700 }, { "epoch": 62.80634029361876, "grad_norm": 0.1778232604265213, "learning_rate": 0.001, "loss": 1.7909, "step": 1073800 }, { "epoch": 62.812189272971864, "grad_norm": 0.18804122507572174, "learning_rate": 0.001, "loss": 1.7905, "step": 1073900 }, { "epoch": 62.81803825232497, "grad_norm": 0.1886914074420929, "learning_rate": 0.001, "loss": 1.7899, "step": 1074000 }, { "epoch": 62.82388723167807, "grad_norm": 0.17944656312465668, "learning_rate": 0.001, "loss": 1.7903, "step": 1074100 }, { "epoch": 62.829736211031175, "grad_norm": 0.2216186374425888, "learning_rate": 0.001, "loss": 1.7859, "step": 1074200 }, { "epoch": 62.83558519038428, "grad_norm": 0.16260738670825958, "learning_rate": 0.001, "loss": 1.7834, "step": 1074300 }, { "epoch": 62.84143416973738, "grad_norm": 0.20492808520793915, "learning_rate": 0.001, "loss": 1.7849, "step": 1074400 }, { "epoch": 62.84728314909049, "grad_norm": 0.1964171826839447, "learning_rate": 0.001, "loss": 1.7876, "step": 1074500 }, { "epoch": 62.85313212844358, "grad_norm": 0.20547890663146973, "learning_rate": 0.001, "loss": 1.7923, "step": 1074600 }, { "epoch": 62.85898110779669, "grad_norm": 0.16920751333236694, "learning_rate": 0.001, "loss": 1.7888, "step": 1074700 }, { "epoch": 62.86483008714979, "grad_norm": 0.179753378033638, "learning_rate": 0.001, "loss": 1.7886, "step": 1074800 }, { "epoch": 62.870679066502895, "grad_norm": 0.20820574462413788, "learning_rate": 0.001, "loss": 1.7926, "step": 1074900 }, { "epoch": 62.876528045856, "grad_norm": 0.27514052391052246, "learning_rate": 0.001, "loss": 1.7868, "step": 1075000 }, { "epoch": 62.876528045856, "eval_ag_news_accuracy": 0.247078125, "eval_ag_news_bleu_score": 7.154741121432783, "eval_ag_news_bleu_score_sem": 0.4769748770021314, "eval_ag_news_emb_cos_sim": 0.7251813411712646, "eval_ag_news_emb_cos_sim_sem": 0.012735005468130112, "eval_ag_news_emb_top1_equal": 0.96875, "eval_ag_news_emb_top1_equal_sem": 0.01543935015797615, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6790995597839355, "eval_ag_news_n_ngrams_match_1": 14.7265625, "eval_ag_news_n_ngrams_match_2": 4.1953125, "eval_ag_news_n_ngrams_match_3": 1.578125, "eval_ag_news_num_pred_words": 48.2421875, "eval_ag_news_num_true_words": 44.6875, "eval_ag_news_perplexity": 14.571966202290566, "eval_ag_news_pred_num_tokens": 70.265625, "eval_ag_news_rouge_score": 0.2995939949672194, "eval_ag_news_runtime": 40.0239, "eval_ag_news_samples_per_second": 12.493, "eval_ag_news_steps_per_second": 0.025, "eval_ag_news_token_set_f1": 0.3373225857649064, "eval_ag_news_token_set_f1_sem": 0.00936176117544336, "eval_ag_news_token_set_precision": 0.3248809968457997, "eval_ag_news_token_set_recall": 0.3572396497730653, "eval_ag_news_true_num_tokens": 62.3828125, "step": 1075000 }, { "epoch": 62.876528045856, "eval_anthropic_toxic_prompts_accuracy": 0.105671875, "eval_anthropic_toxic_prompts_bleu_score": 46.35477174487304, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.8408390788794295, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9065923690795898, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.008950174786150455, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.984375, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.011004959233105183, "eval_anthropic_toxic_prompts_exact_match": 0.1953125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03517845700641747, "eval_anthropic_toxic_prompts_loss": 1.2652984857559204, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.3515625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.796875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.828125, "eval_anthropic_toxic_prompts_num_pred_words": 14.7265625, "eval_anthropic_toxic_prompts_num_true_words": 14.1015625, "eval_anthropic_toxic_prompts_perplexity": 3.5441504566720234, "eval_anthropic_toxic_prompts_pred_num_tokens": 18.46875, "eval_anthropic_toxic_prompts_rouge_score": 0.7143815393686461, "eval_anthropic_toxic_prompts_runtime": 32.9917, "eval_anthropic_toxic_prompts_samples_per_second": 15.155, "eval_anthropic_toxic_prompts_steps_per_second": 0.03, "eval_anthropic_toxic_prompts_token_set_f1": 0.7343452729560189, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018275566827435332, "eval_anthropic_toxic_prompts_token_set_precision": 0.7389340854427539, "eval_anthropic_toxic_prompts_token_set_recall": 0.7348326208261722, "eval_anthropic_toxic_prompts_true_num_tokens": 17.3046875, "step": 1075000 }, { "epoch": 62.876528045856, "eval_arxiv_accuracy": 0.377859375, "eval_arxiv_bleu_score": 1.8887972478328905, "eval_arxiv_bleu_score_sem": 0.16522483241197886, "eval_arxiv_emb_cos_sim": 0.47465208172798157, "eval_arxiv_emb_cos_sim_sem": 0.018807033076882362, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4338572025299072, "eval_arxiv_n_ngrams_match_1": 14.1484375, "eval_arxiv_n_ngrams_match_2": 2.5859375, "eval_arxiv_n_ngrams_match_3": 0.59375, "eval_arxiv_num_pred_words": 59.5234375, "eval_arxiv_num_true_words": 86.3046875, "eval_arxiv_perplexity": 30.995970201334117, "eval_arxiv_pred_num_tokens": 125.9140625, "eval_arxiv_rouge_score": 0.1846109716512908, "eval_arxiv_runtime": 34.9914, "eval_arxiv_samples_per_second": 14.289, "eval_arxiv_steps_per_second": 0.029, "eval_arxiv_token_set_f1": 0.1872800300723365, "eval_arxiv_token_set_f1_sem": 0.007135905321069458, "eval_arxiv_token_set_precision": 0.12828967029782354, "eval_arxiv_token_set_recall": 0.4103008726327719, "eval_arxiv_true_num_tokens": 125.6171875, "step": 1075000 }, { "epoch": 62.876528045856, "eval_python_code_alpaca_accuracy": 0.131140625, "eval_python_code_alpaca_bleu_score": 29.638501795136182, "eval_python_code_alpaca_bleu_score_sem": 1.7388930719557671, "eval_python_code_alpaca_emb_cos_sim": 0.8807464838027954, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008658220991492271, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0, "eval_python_code_alpaca_exact_match_sem": 0.0, "eval_python_code_alpaca_loss": 1.4154330492019653, "eval_python_code_alpaca_n_ngrams_match_1": 11.3125, "eval_python_code_alpaca_n_ngrams_match_2": 6.2890625, "eval_python_code_alpaca_n_ngrams_match_3": 3.6484375, "eval_python_code_alpaca_num_pred_words": 18.625, "eval_python_code_alpaca_num_true_words": 19.484375, "eval_python_code_alpaca_perplexity": 4.118269493193643, "eval_python_code_alpaca_pred_num_tokens": 24.140625, "eval_python_code_alpaca_rouge_score": 0.6159529319679617, "eval_python_code_alpaca_runtime": 34.4523, "eval_python_code_alpaca_samples_per_second": 14.513, "eval_python_code_alpaca_steps_per_second": 0.029, "eval_python_code_alpaca_token_set_f1": 0.6296000704122204, "eval_python_code_alpaca_token_set_f1_sem": 0.014897546774938377, "eval_python_code_alpaca_token_set_precision": 0.6208174869431932, "eval_python_code_alpaca_token_set_recall": 0.6445553973135634, "eval_python_code_alpaca_true_num_tokens": 24.71875, "step": 1075000 }, { "epoch": 62.876528045856, "eval_wikibio_accuracy": 0.370265625, "eval_wikibio_bleu_score": 9.014752905290974, "eval_wikibio_bleu_score_sem": 0.7909258632798659, "eval_wikibio_emb_cos_sim": 0.6212046146392822, "eval_wikibio_emb_cos_sim_sem": 0.022217286750674248, "eval_wikibio_emb_top1_equal": 0.90625, "eval_wikibio_emb_top1_equal_sem": 0.025864720344543457, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6749653816223145, "eval_wikibio_n_ngrams_match_1": 16.6015625, "eval_wikibio_n_ngrams_match_2": 5.8515625, "eval_wikibio_n_ngrams_match_3": 2.671875, "eval_wikibio_num_pred_words": 54.796875, "eval_wikibio_num_true_words": 53.2109375, "eval_wikibio_perplexity": 14.511847454278994, "eval_wikibio_pred_num_tokens": 105.5625, "eval_wikibio_rouge_score": 0.3278227519708147, "eval_wikibio_runtime": 35.395, "eval_wikibio_samples_per_second": 14.126, "eval_wikibio_steps_per_second": 0.028, "eval_wikibio_token_set_f1": 0.33872872478721905, "eval_wikibio_token_set_f1_sem": 0.012341346248656999, "eval_wikibio_token_set_precision": 0.30914575990080506, "eval_wikibio_token_set_recall": 0.40348300424586325, "eval_wikibio_true_num_tokens": 101.40625, "step": 1075000 }, { "epoch": 62.876528045856, "eval_msmarco_accuracy": 0.401359375, "eval_msmarco_bleu_score": 17.2329273453849, "eval_msmarco_bleu_score_sem": 1.5304922384821398, "eval_msmarco_emb_cos_sim": 0.7880241870880127, "eval_msmarco_emb_cos_sim_sem": 0.01577726937830448, "eval_msmarco_emb_top1_equal": 0.9765625, "eval_msmarco_emb_top1_equal_sem": 0.013424675911664963, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7092676162719727, "eval_msmarco_n_ngrams_match_1": 28.7734375, "eval_msmarco_n_ngrams_match_2": 13.3046875, "eval_msmarco_n_ngrams_match_3": 7.796875, "eval_msmarco_num_pred_words": 62.125, "eval_msmarco_num_true_words": 62.9140625, "eval_msmarco_perplexity": 5.524913638671527, "eval_msmarco_pred_num_tokens": 86.5859375, "eval_msmarco_rouge_score": 0.44576976663292867, "eval_msmarco_runtime": 29.0106, "eval_msmarco_samples_per_second": 17.235, "eval_msmarco_steps_per_second": 0.034, "eval_msmarco_token_set_f1": 0.47148128802906714, "eval_msmarco_token_set_f1_sem": 0.014551981433194727, "eval_msmarco_token_set_precision": 0.4357345539451754, "eval_msmarco_token_set_recall": 0.5372240205157292, "eval_msmarco_true_num_tokens": 81.453125, "step": 1075000 }, { "epoch": 62.8823770252091, "grad_norm": 0.22508233785629272, "learning_rate": 0.001, "loss": 1.7854, "step": 1075100 }, { "epoch": 62.888226004562206, "grad_norm": 0.1877170354127884, "learning_rate": 0.001, "loss": 1.7864, "step": 1075200 }, { "epoch": 62.89407498391531, "grad_norm": 0.16165706515312195, "learning_rate": 0.001, "loss": 1.7933, "step": 1075300 }, { "epoch": 62.89992396326841, "grad_norm": 0.25060737133026123, "learning_rate": 0.001, "loss": 1.7915, "step": 1075400 }, { "epoch": 62.90577294262151, "grad_norm": 0.21343998610973358, "learning_rate": 0.001, "loss": 1.7893, "step": 1075500 }, { "epoch": 62.911621921974614, "grad_norm": 0.1785614937543869, "learning_rate": 0.001, "loss": 1.789, "step": 1075600 }, { "epoch": 62.91747090132772, "grad_norm": 0.27940016984939575, "learning_rate": 0.001, "loss": 1.7894, "step": 1075700 }, { "epoch": 62.92331988068082, "grad_norm": 0.18845073878765106, "learning_rate": 0.001, "loss": 1.7936, "step": 1075800 }, { "epoch": 62.929168860033926, "grad_norm": 0.18531544506549835, "learning_rate": 0.001, "loss": 1.7909, "step": 1075900 }, { "epoch": 62.93501783938703, "grad_norm": 0.18648259341716766, "learning_rate": 0.001, "loss": 1.7905, "step": 1076000 }, { "epoch": 62.94086681874013, "grad_norm": 0.23726384341716766, "learning_rate": 0.001, "loss": 1.7899, "step": 1076100 }, { "epoch": 62.94671579809323, "grad_norm": 0.18440183997154236, "learning_rate": 0.001, "loss": 1.7965, "step": 1076200 }, { "epoch": 62.952564777446334, "grad_norm": 0.28543543815612793, "learning_rate": 0.001, "loss": 1.7924, "step": 1076300 }, { "epoch": 62.95841375679944, "grad_norm": 0.16363383829593658, "learning_rate": 0.001, "loss": 1.8029, "step": 1076400 }, { "epoch": 62.96426273615254, "grad_norm": 0.17258837819099426, "learning_rate": 0.001, "loss": 1.7915, "step": 1076500 }, { "epoch": 62.970111715505645, "grad_norm": 0.18310458958148956, "learning_rate": 0.001, "loss": 1.7918, "step": 1076600 }, { "epoch": 62.97596069485875, "grad_norm": 0.20100459456443787, "learning_rate": 0.001, "loss": 1.7899, "step": 1076700 }, { "epoch": 62.98180967421185, "grad_norm": 0.20067261159420013, "learning_rate": 0.001, "loss": 1.7871, "step": 1076800 }, { "epoch": 62.98765865356495, "grad_norm": 0.19169820845127106, "learning_rate": 0.001, "loss": 1.789, "step": 1076900 }, { "epoch": 62.99350763291805, "grad_norm": 0.18963845074176788, "learning_rate": 0.001, "loss": 1.7939, "step": 1077000 }, { "epoch": 62.99935661227116, "grad_norm": 0.17841556668281555, "learning_rate": 0.001, "loss": 1.7883, "step": 1077100 }, { "epoch": 63.00520559162426, "grad_norm": 0.2087872326374054, "learning_rate": 0.001, "loss": 1.7755, "step": 1077200 }, { "epoch": 63.011054570977365, "grad_norm": 0.16340015828609467, "learning_rate": 0.001, "loss": 1.777, "step": 1077300 }, { "epoch": 63.01690355033047, "grad_norm": 0.19031746685504913, "learning_rate": 0.001, "loss": 1.7708, "step": 1077400 }, { "epoch": 63.02275252968357, "grad_norm": 0.15847957134246826, "learning_rate": 0.001, "loss": 1.7778, "step": 1077500 }, { "epoch": 63.028601509036676, "grad_norm": 0.21547828614711761, "learning_rate": 0.001, "loss": 1.7749, "step": 1077600 }, { "epoch": 63.03445048838977, "grad_norm": 0.20617084205150604, "learning_rate": 0.001, "loss": 1.7872, "step": 1077700 }, { "epoch": 63.04029946774288, "grad_norm": 0.1592460423707962, "learning_rate": 0.001, "loss": 1.7745, "step": 1077800 }, { "epoch": 63.04614844709598, "grad_norm": 0.16094152629375458, "learning_rate": 0.001, "loss": 1.7811, "step": 1077900 }, { "epoch": 63.051997426449084, "grad_norm": 0.39792487025260925, "learning_rate": 0.001, "loss": 1.7788, "step": 1078000 }, { "epoch": 63.05784640580219, "grad_norm": 0.18557339906692505, "learning_rate": 0.001, "loss": 1.7802, "step": 1078100 }, { "epoch": 63.06369538515529, "grad_norm": 0.17075267434120178, "learning_rate": 0.001, "loss": 1.7765, "step": 1078200 }, { "epoch": 63.069544364508396, "grad_norm": 0.1855892688035965, "learning_rate": 0.001, "loss": 1.7861, "step": 1078300 }, { "epoch": 63.0753933438615, "grad_norm": 0.16307656466960907, "learning_rate": 0.001, "loss": 1.778, "step": 1078400 }, { "epoch": 63.081242323214596, "grad_norm": 0.27163493633270264, "learning_rate": 0.001, "loss": 1.7753, "step": 1078500 }, { "epoch": 63.0870913025677, "grad_norm": 0.2603698968887329, "learning_rate": 0.001, "loss": 1.7762, "step": 1078600 }, { "epoch": 63.092940281920804, "grad_norm": 0.15648214519023895, "learning_rate": 0.001, "loss": 1.7748, "step": 1078700 }, { "epoch": 63.09878926127391, "grad_norm": 0.2608214318752289, "learning_rate": 0.001, "loss": 1.7862, "step": 1078800 }, { "epoch": 63.10463824062701, "grad_norm": 0.1449958235025406, "learning_rate": 0.001, "loss": 1.7812, "step": 1078900 }, { "epoch": 63.110487219980115, "grad_norm": 0.18185292184352875, "learning_rate": 0.001, "loss": 1.7791, "step": 1079000 }, { "epoch": 63.11633619933322, "grad_norm": 0.1741584688425064, "learning_rate": 0.001, "loss": 1.7758, "step": 1079100 }, { "epoch": 63.12218517868632, "grad_norm": 0.1288091093301773, "learning_rate": 0.001, "loss": 1.7787, "step": 1079200 }, { "epoch": 63.12803415803942, "grad_norm": 0.23023812472820282, "learning_rate": 0.001, "loss": 1.7852, "step": 1079300 }, { "epoch": 63.13388313739252, "grad_norm": 0.19961324334144592, "learning_rate": 0.001, "loss": 1.7825, "step": 1079400 }, { "epoch": 63.13973211674563, "grad_norm": 0.18197940289974213, "learning_rate": 0.001, "loss": 1.782, "step": 1079500 }, { "epoch": 63.14558109609873, "grad_norm": 0.1738271266222, "learning_rate": 0.001, "loss": 1.7838, "step": 1079600 }, { "epoch": 63.151430075451835, "grad_norm": 0.1531350463628769, "learning_rate": 0.001, "loss": 1.7774, "step": 1079700 }, { "epoch": 63.15727905480494, "grad_norm": 0.126180961728096, "learning_rate": 0.001, "loss": 1.7777, "step": 1079800 }, { "epoch": 63.16312803415804, "grad_norm": 0.14334578812122345, "learning_rate": 0.001, "loss": 1.7775, "step": 1079900 }, { "epoch": 63.16897701351114, "grad_norm": 0.253510445356369, "learning_rate": 0.001, "loss": 1.7796, "step": 1080000 }, { "epoch": 63.17482599286424, "grad_norm": 0.27934184670448303, "learning_rate": 0.001, "loss": 1.7812, "step": 1080100 }, { "epoch": 63.180674972217346, "grad_norm": 0.24851982295513153, "learning_rate": 0.001, "loss": 1.7828, "step": 1080200 }, { "epoch": 63.18652395157045, "grad_norm": 0.2185906022787094, "learning_rate": 0.001, "loss": 1.7784, "step": 1080300 }, { "epoch": 63.192372930923554, "grad_norm": 0.17833182215690613, "learning_rate": 0.001, "loss": 1.7872, "step": 1080400 }, { "epoch": 63.19822191027666, "grad_norm": 0.19595380127429962, "learning_rate": 0.001, "loss": 1.7884, "step": 1080500 }, { "epoch": 63.20407088962976, "grad_norm": 0.1553599089384079, "learning_rate": 0.001, "loss": 1.7762, "step": 1080600 }, { "epoch": 63.209919868982865, "grad_norm": 0.17133349180221558, "learning_rate": 0.001, "loss": 1.7772, "step": 1080700 }, { "epoch": 63.21576884833596, "grad_norm": 0.15346340835094452, "learning_rate": 0.001, "loss": 1.7799, "step": 1080800 }, { "epoch": 63.221617827689066, "grad_norm": 0.15124861896038055, "learning_rate": 0.001, "loss": 1.7815, "step": 1080900 }, { "epoch": 63.22746680704217, "grad_norm": 0.1436769813299179, "learning_rate": 0.001, "loss": 1.7813, "step": 1081000 }, { "epoch": 63.23331578639527, "grad_norm": 0.19692283868789673, "learning_rate": 0.001, "loss": 1.7816, "step": 1081100 }, { "epoch": 63.23916476574838, "grad_norm": 0.2052643746137619, "learning_rate": 0.001, "loss": 1.7805, "step": 1081200 }, { "epoch": 63.24501374510148, "grad_norm": 0.18215101957321167, "learning_rate": 0.001, "loss": 1.7868, "step": 1081300 }, { "epoch": 63.250862724454585, "grad_norm": 0.16229136288166046, "learning_rate": 0.001, "loss": 1.7774, "step": 1081400 }, { "epoch": 63.25671170380769, "grad_norm": 0.17776523530483246, "learning_rate": 0.001, "loss": 1.7745, "step": 1081500 }, { "epoch": 63.262560683160785, "grad_norm": 0.19092200696468353, "learning_rate": 0.001, "loss": 1.7872, "step": 1081600 }, { "epoch": 63.26840966251389, "grad_norm": 0.20473238825798035, "learning_rate": 0.001, "loss": 1.7828, "step": 1081700 }, { "epoch": 63.27425864186699, "grad_norm": 0.19955745339393616, "learning_rate": 0.001, "loss": 1.7833, "step": 1081800 }, { "epoch": 63.2801076212201, "grad_norm": 0.1504485011100769, "learning_rate": 0.001, "loss": 1.7842, "step": 1081900 }, { "epoch": 63.2859566005732, "grad_norm": 0.2123221904039383, "learning_rate": 0.001, "loss": 1.7824, "step": 1082000 }, { "epoch": 63.291805579926304, "grad_norm": 0.1962304264307022, "learning_rate": 0.001, "loss": 1.7835, "step": 1082100 }, { "epoch": 63.29765455927941, "grad_norm": 0.17358067631721497, "learning_rate": 0.001, "loss": 1.7807, "step": 1082200 }, { "epoch": 63.30350353863251, "grad_norm": 0.13716191053390503, "learning_rate": 0.001, "loss": 1.7826, "step": 1082300 }, { "epoch": 63.30935251798561, "grad_norm": 0.20038580894470215, "learning_rate": 0.001, "loss": 1.7828, "step": 1082400 }, { "epoch": 63.31520149733871, "grad_norm": 0.2017480731010437, "learning_rate": 0.001, "loss": 1.7788, "step": 1082500 }, { "epoch": 63.321050476691816, "grad_norm": 0.22415518760681152, "learning_rate": 0.001, "loss": 1.7819, "step": 1082600 }, { "epoch": 63.32689945604492, "grad_norm": 0.22133417427539825, "learning_rate": 0.001, "loss": 1.7863, "step": 1082700 }, { "epoch": 63.332748435398024, "grad_norm": 0.17296889424324036, "learning_rate": 0.001, "loss": 1.7799, "step": 1082800 }, { "epoch": 63.33859741475113, "grad_norm": 0.1665416955947876, "learning_rate": 0.001, "loss": 1.7844, "step": 1082900 }, { "epoch": 63.34444639410423, "grad_norm": 0.15594521164894104, "learning_rate": 0.001, "loss": 1.7811, "step": 1083000 }, { "epoch": 63.35029537345733, "grad_norm": 0.18485532701015472, "learning_rate": 0.001, "loss": 1.7825, "step": 1083100 }, { "epoch": 63.35614435281043, "grad_norm": 0.21866151690483093, "learning_rate": 0.001, "loss": 1.779, "step": 1083200 }, { "epoch": 63.361993332163536, "grad_norm": 0.15614300966262817, "learning_rate": 0.001, "loss": 1.7843, "step": 1083300 }, { "epoch": 63.36784231151664, "grad_norm": 0.20470669865608215, "learning_rate": 0.001, "loss": 1.779, "step": 1083400 }, { "epoch": 63.37369129086974, "grad_norm": 0.2794627249240875, "learning_rate": 0.001, "loss": 1.7911, "step": 1083500 }, { "epoch": 63.37954027022285, "grad_norm": 0.19805794954299927, "learning_rate": 0.001, "loss": 1.7814, "step": 1083600 }, { "epoch": 63.38538924957595, "grad_norm": 0.27671584486961365, "learning_rate": 0.001, "loss": 1.7859, "step": 1083700 }, { "epoch": 63.391238228929055, "grad_norm": 0.17716683447360992, "learning_rate": 0.001, "loss": 1.7822, "step": 1083800 }, { "epoch": 63.39708720828215, "grad_norm": 0.255270391702652, "learning_rate": 0.001, "loss": 1.7899, "step": 1083900 }, { "epoch": 63.402936187635255, "grad_norm": 0.23474261164665222, "learning_rate": 0.001, "loss": 1.7873, "step": 1084000 }, { "epoch": 63.40878516698836, "grad_norm": 0.198452889919281, "learning_rate": 0.001, "loss": 1.7756, "step": 1084100 }, { "epoch": 63.41463414634146, "grad_norm": 0.17607152462005615, "learning_rate": 0.001, "loss": 1.7827, "step": 1084200 }, { "epoch": 63.42048312569457, "grad_norm": 0.20781779289245605, "learning_rate": 0.001, "loss": 1.7843, "step": 1084300 }, { "epoch": 63.42633210504767, "grad_norm": 0.1732039898633957, "learning_rate": 0.001, "loss": 1.7874, "step": 1084400 }, { "epoch": 63.432181084400774, "grad_norm": 0.12239833176136017, "learning_rate": 0.001, "loss": 1.7867, "step": 1084500 }, { "epoch": 63.43803006375388, "grad_norm": 0.18967615067958832, "learning_rate": 0.001, "loss": 1.7814, "step": 1084600 }, { "epoch": 63.443879043106975, "grad_norm": 0.17959599196910858, "learning_rate": 0.001, "loss": 1.7821, "step": 1084700 }, { "epoch": 63.44972802246008, "grad_norm": 0.13945864140987396, "learning_rate": 0.001, "loss": 1.7909, "step": 1084800 }, { "epoch": 63.45557700181318, "grad_norm": 0.17103978991508484, "learning_rate": 0.001, "loss": 1.7821, "step": 1084900 }, { "epoch": 63.461425981166286, "grad_norm": 0.13622546195983887, "learning_rate": 0.001, "loss": 1.7823, "step": 1085000 }, { "epoch": 63.46727496051939, "grad_norm": 0.16047126054763794, "learning_rate": 0.001, "loss": 1.7829, "step": 1085100 }, { "epoch": 63.473123939872494, "grad_norm": 0.1599205583333969, "learning_rate": 0.001, "loss": 1.7832, "step": 1085200 }, { "epoch": 63.4789729192256, "grad_norm": 0.1671553999185562, "learning_rate": 0.001, "loss": 1.7877, "step": 1085300 }, { "epoch": 63.4848218985787, "grad_norm": 0.16728264093399048, "learning_rate": 0.001, "loss": 1.7808, "step": 1085400 }, { "epoch": 63.4906708779318, "grad_norm": 0.13749320805072784, "learning_rate": 0.001, "loss": 1.7823, "step": 1085500 }, { "epoch": 63.4965198572849, "grad_norm": 0.1372915506362915, "learning_rate": 0.001, "loss": 1.7823, "step": 1085600 }, { "epoch": 63.502368836638006, "grad_norm": 0.20426414906978607, "learning_rate": 0.001, "loss": 1.7844, "step": 1085700 }, { "epoch": 63.50821781599111, "grad_norm": 0.18114419281482697, "learning_rate": 0.001, "loss": 1.784, "step": 1085800 }, { "epoch": 63.51406679534421, "grad_norm": 0.1686066836118698, "learning_rate": 0.001, "loss": 1.7877, "step": 1085900 }, { "epoch": 63.51991577469732, "grad_norm": 0.2750754952430725, "learning_rate": 0.001, "loss": 1.7882, "step": 1086000 }, { "epoch": 63.52576475405042, "grad_norm": 0.17453013360500336, "learning_rate": 0.001, "loss": 1.7862, "step": 1086100 }, { "epoch": 63.53161373340352, "grad_norm": 0.22919081151485443, "learning_rate": 0.001, "loss": 1.7807, "step": 1086200 }, { "epoch": 63.53746271275662, "grad_norm": 0.1665843427181244, "learning_rate": 0.001, "loss": 1.7834, "step": 1086300 }, { "epoch": 63.543311692109725, "grad_norm": 0.2019471973180771, "learning_rate": 0.001, "loss": 1.7911, "step": 1086400 }, { "epoch": 63.54916067146283, "grad_norm": 0.19234582781791687, "learning_rate": 0.001, "loss": 1.7925, "step": 1086500 }, { "epoch": 63.55500965081593, "grad_norm": 0.3199411928653717, "learning_rate": 0.001, "loss": 1.7854, "step": 1086600 }, { "epoch": 63.56085863016904, "grad_norm": 0.18252432346343994, "learning_rate": 0.001, "loss": 1.7852, "step": 1086700 }, { "epoch": 63.56670760952214, "grad_norm": 0.18358568847179413, "learning_rate": 0.001, "loss": 1.7805, "step": 1086800 }, { "epoch": 63.572556588875244, "grad_norm": 0.1904280185699463, "learning_rate": 0.001, "loss": 1.7883, "step": 1086900 }, { "epoch": 63.57840556822834, "grad_norm": 0.20650352537631989, "learning_rate": 0.001, "loss": 1.7811, "step": 1087000 }, { "epoch": 63.584254547581445, "grad_norm": 0.22916826605796814, "learning_rate": 0.001, "loss": 1.7817, "step": 1087100 }, { "epoch": 63.59010352693455, "grad_norm": 0.230648472905159, "learning_rate": 0.001, "loss": 1.7898, "step": 1087200 }, { "epoch": 63.59595250628765, "grad_norm": 0.20537297427654266, "learning_rate": 0.001, "loss": 1.7807, "step": 1087300 }, { "epoch": 63.601801485640756, "grad_norm": 0.1606421172618866, "learning_rate": 0.001, "loss": 1.7875, "step": 1087400 }, { "epoch": 63.60765046499386, "grad_norm": 0.18115463852882385, "learning_rate": 0.001, "loss": 1.7823, "step": 1087500 }, { "epoch": 63.613499444346964, "grad_norm": 0.14755181968212128, "learning_rate": 0.001, "loss": 1.7773, "step": 1087600 }, { "epoch": 63.61934842370007, "grad_norm": 0.22333666682243347, "learning_rate": 0.001, "loss": 1.7828, "step": 1087700 }, { "epoch": 63.625197403053164, "grad_norm": 0.17337918281555176, "learning_rate": 0.001, "loss": 1.7845, "step": 1087800 }, { "epoch": 63.63104638240627, "grad_norm": 0.14380629360675812, "learning_rate": 0.001, "loss": 1.7836, "step": 1087900 }, { "epoch": 63.63689536175937, "grad_norm": 0.21746009588241577, "learning_rate": 0.001, "loss": 1.7805, "step": 1088000 }, { "epoch": 63.642744341112476, "grad_norm": 0.16072095930576324, "learning_rate": 0.001, "loss": 1.7864, "step": 1088100 }, { "epoch": 63.64859332046558, "grad_norm": 0.1374565213918686, "learning_rate": 0.001, "loss": 1.7928, "step": 1088200 }, { "epoch": 63.65444229981868, "grad_norm": 0.21225255727767944, "learning_rate": 0.001, "loss": 1.7858, "step": 1088300 }, { "epoch": 63.66029127917179, "grad_norm": 0.13966888189315796, "learning_rate": 0.001, "loss": 1.7927, "step": 1088400 }, { "epoch": 63.66614025852489, "grad_norm": 0.2233794629573822, "learning_rate": 0.001, "loss": 1.7833, "step": 1088500 }, { "epoch": 63.67198923787799, "grad_norm": 0.2106112241744995, "learning_rate": 0.001, "loss": 1.7857, "step": 1088600 }, { "epoch": 63.67783821723109, "grad_norm": 0.14984440803527832, "learning_rate": 0.001, "loss": 1.7843, "step": 1088700 }, { "epoch": 63.683687196584195, "grad_norm": 0.23920588195323944, "learning_rate": 0.001, "loss": 1.7847, "step": 1088800 }, { "epoch": 63.6895361759373, "grad_norm": 0.15020525455474854, "learning_rate": 0.001, "loss": 1.7843, "step": 1088900 }, { "epoch": 63.6953851552904, "grad_norm": 0.26797160506248474, "learning_rate": 0.001, "loss": 1.7873, "step": 1089000 }, { "epoch": 63.70123413464351, "grad_norm": 0.128804549574852, "learning_rate": 0.001, "loss": 1.7891, "step": 1089100 }, { "epoch": 63.70708311399661, "grad_norm": 0.22675861418247223, "learning_rate": 0.001, "loss": 1.7903, "step": 1089200 }, { "epoch": 63.71293209334971, "grad_norm": 0.15609091520309448, "learning_rate": 0.001, "loss": 1.7848, "step": 1089300 }, { "epoch": 63.71878107270281, "grad_norm": 0.1557946801185608, "learning_rate": 0.001, "loss": 1.7892, "step": 1089400 }, { "epoch": 63.724630052055915, "grad_norm": 0.18451045453548431, "learning_rate": 0.001, "loss": 1.7889, "step": 1089500 }, { "epoch": 63.73047903140902, "grad_norm": 0.15679973363876343, "learning_rate": 0.001, "loss": 1.7888, "step": 1089600 }, { "epoch": 63.73632801076212, "grad_norm": 0.16345106065273285, "learning_rate": 0.001, "loss": 1.7886, "step": 1089700 }, { "epoch": 63.742176990115226, "grad_norm": 0.14478564262390137, "learning_rate": 0.001, "loss": 1.7893, "step": 1089800 }, { "epoch": 63.74802596946833, "grad_norm": 0.155181422829628, "learning_rate": 0.001, "loss": 1.7904, "step": 1089900 }, { "epoch": 63.753874948821434, "grad_norm": 0.2856277823448181, "learning_rate": 0.001, "loss": 1.7858, "step": 1090000 }, { "epoch": 63.75972392817453, "grad_norm": 0.2005413919687271, "learning_rate": 0.001, "loss": 1.7898, "step": 1090100 }, { "epoch": 63.765572907527634, "grad_norm": 0.2034231573343277, "learning_rate": 0.001, "loss": 1.7878, "step": 1090200 }, { "epoch": 63.77142188688074, "grad_norm": 0.20442216098308563, "learning_rate": 0.001, "loss": 1.7913, "step": 1090300 }, { "epoch": 63.77727086623384, "grad_norm": 0.13927598297595978, "learning_rate": 0.001, "loss": 1.7808, "step": 1090400 }, { "epoch": 63.783119845586945, "grad_norm": 0.17363788187503815, "learning_rate": 0.001, "loss": 1.7864, "step": 1090500 }, { "epoch": 63.78896882494005, "grad_norm": 0.1482969969511032, "learning_rate": 0.001, "loss": 1.7846, "step": 1090600 }, { "epoch": 63.79481780429315, "grad_norm": 0.14452950656414032, "learning_rate": 0.001, "loss": 1.7898, "step": 1090700 }, { "epoch": 63.80066678364626, "grad_norm": 0.22290945053100586, "learning_rate": 0.001, "loss": 1.788, "step": 1090800 }, { "epoch": 63.806515762999354, "grad_norm": 0.1365858018398285, "learning_rate": 0.001, "loss": 1.7843, "step": 1090900 }, { "epoch": 63.81236474235246, "grad_norm": 0.13661719858646393, "learning_rate": 0.001, "loss": 1.7829, "step": 1091000 }, { "epoch": 63.81821372170556, "grad_norm": 0.18645450472831726, "learning_rate": 0.001, "loss": 1.7837, "step": 1091100 }, { "epoch": 63.824062701058665, "grad_norm": 0.19855839014053345, "learning_rate": 0.001, "loss": 1.7865, "step": 1091200 }, { "epoch": 63.82991168041177, "grad_norm": 0.17808450758457184, "learning_rate": 0.001, "loss": 1.7859, "step": 1091300 }, { "epoch": 63.83576065976487, "grad_norm": 0.1535247266292572, "learning_rate": 0.001, "loss": 1.7877, "step": 1091400 }, { "epoch": 63.841609639117976, "grad_norm": 0.1469896286725998, "learning_rate": 0.001, "loss": 1.7874, "step": 1091500 }, { "epoch": 63.84745861847108, "grad_norm": 0.17326875030994415, "learning_rate": 0.001, "loss": 1.7887, "step": 1091600 }, { "epoch": 63.85330759782418, "grad_norm": 0.12498553842306137, "learning_rate": 0.001, "loss": 1.7878, "step": 1091700 }, { "epoch": 63.85915657717728, "grad_norm": 0.1685684621334076, "learning_rate": 0.001, "loss": 1.7847, "step": 1091800 }, { "epoch": 63.865005556530384, "grad_norm": 0.14909423887729645, "learning_rate": 0.001, "loss": 1.7875, "step": 1091900 }, { "epoch": 63.87085453588349, "grad_norm": 0.14216504991054535, "learning_rate": 0.001, "loss": 1.7883, "step": 1092000 }, { "epoch": 63.87670351523659, "grad_norm": 0.20461682975292206, "learning_rate": 0.001, "loss": 1.7849, "step": 1092100 }, { "epoch": 63.882552494589696, "grad_norm": 0.16139602661132812, "learning_rate": 0.001, "loss": 1.7867, "step": 1092200 }, { "epoch": 63.8884014739428, "grad_norm": 0.22902821004390717, "learning_rate": 0.001, "loss": 1.7861, "step": 1092300 }, { "epoch": 63.894250453295896, "grad_norm": 0.17474043369293213, "learning_rate": 0.001, "loss": 1.7889, "step": 1092400 }, { "epoch": 63.900099432649, "grad_norm": 0.1756260246038437, "learning_rate": 0.001, "loss": 1.7863, "step": 1092500 }, { "epoch": 63.905948412002104, "grad_norm": 0.19327029585838318, "learning_rate": 0.001, "loss": 1.7822, "step": 1092600 }, { "epoch": 63.91179739135521, "grad_norm": 0.13332374393939972, "learning_rate": 0.001, "loss": 1.78, "step": 1092700 }, { "epoch": 63.91764637070831, "grad_norm": 0.11886879056692123, "learning_rate": 0.001, "loss": 1.7961, "step": 1092800 }, { "epoch": 63.923495350061415, "grad_norm": 0.20919807255268097, "learning_rate": 0.001, "loss": 1.7862, "step": 1092900 }, { "epoch": 63.92934432941452, "grad_norm": 0.34567850828170776, "learning_rate": 0.001, "loss": 1.7914, "step": 1093000 }, { "epoch": 63.93519330876762, "grad_norm": 0.16534864902496338, "learning_rate": 0.001, "loss": 1.789, "step": 1093100 }, { "epoch": 63.94104228812072, "grad_norm": 0.19001246988773346, "learning_rate": 0.001, "loss": 1.7957, "step": 1093200 }, { "epoch": 63.94689126747382, "grad_norm": 0.19177119433879852, "learning_rate": 0.001, "loss": 1.7878, "step": 1093300 }, { "epoch": 63.95274024682693, "grad_norm": 0.13009938597679138, "learning_rate": 0.001, "loss": 1.7901, "step": 1093400 }, { "epoch": 63.95858922618003, "grad_norm": 0.16682069003582, "learning_rate": 0.001, "loss": 1.791, "step": 1093500 }, { "epoch": 63.964438205533135, "grad_norm": 0.1562301516532898, "learning_rate": 0.001, "loss": 1.7893, "step": 1093600 }, { "epoch": 63.97028718488624, "grad_norm": 0.15239599347114563, "learning_rate": 0.001, "loss": 1.7934, "step": 1093700 }, { "epoch": 63.97613616423934, "grad_norm": 0.17102599143981934, "learning_rate": 0.001, "loss": 1.7921, "step": 1093800 }, { "epoch": 63.981985143592446, "grad_norm": 0.20333804190158844, "learning_rate": 0.001, "loss": 1.7898, "step": 1093900 }, { "epoch": 63.98783412294554, "grad_norm": 0.1808238923549652, "learning_rate": 0.001, "loss": 1.79, "step": 1094000 }, { "epoch": 63.99368310229865, "grad_norm": 0.20759595930576324, "learning_rate": 0.001, "loss": 1.7891, "step": 1094100 }, { "epoch": 63.99953208165175, "grad_norm": 0.1524360477924347, "learning_rate": 0.001, "loss": 1.7919, "step": 1094200 }, { "epoch": 64.00538106100485, "grad_norm": 0.19086793065071106, "learning_rate": 0.001, "loss": 1.7852, "step": 1094300 }, { "epoch": 64.01123004035796, "grad_norm": 0.18235982954502106, "learning_rate": 0.001, "loss": 1.7747, "step": 1094400 }, { "epoch": 64.01707901971106, "grad_norm": 0.17272301018238068, "learning_rate": 0.001, "loss": 1.7695, "step": 1094500 }, { "epoch": 64.02292799906417, "grad_norm": 0.1413361132144928, "learning_rate": 0.001, "loss": 1.7735, "step": 1094600 }, { "epoch": 64.02877697841727, "grad_norm": 0.181893453001976, "learning_rate": 0.001, "loss": 1.7771, "step": 1094700 }, { "epoch": 64.03462595777037, "grad_norm": 0.22768396139144897, "learning_rate": 0.001, "loss": 1.7749, "step": 1094800 }, { "epoch": 64.04047493712348, "grad_norm": 0.21559631824493408, "learning_rate": 0.001, "loss": 1.7734, "step": 1094900 }, { "epoch": 64.04632391647658, "grad_norm": 0.3227751851081848, "learning_rate": 0.001, "loss": 1.777, "step": 1095000 }, { "epoch": 64.05217289582968, "grad_norm": 0.19828617572784424, "learning_rate": 0.001, "loss": 1.7825, "step": 1095100 }, { "epoch": 64.05802187518277, "grad_norm": 0.14595623314380646, "learning_rate": 0.001, "loss": 1.7773, "step": 1095200 }, { "epoch": 64.06387085453588, "grad_norm": 0.19532416760921478, "learning_rate": 0.001, "loss": 1.7793, "step": 1095300 }, { "epoch": 64.06971983388898, "grad_norm": 0.15657885372638702, "learning_rate": 0.001, "loss": 1.7805, "step": 1095400 }, { "epoch": 64.07556881324209, "grad_norm": 0.20931941270828247, "learning_rate": 0.001, "loss": 1.7751, "step": 1095500 }, { "epoch": 64.08141779259519, "grad_norm": 0.22721004486083984, "learning_rate": 0.001, "loss": 1.7817, "step": 1095600 }, { "epoch": 64.0872667719483, "grad_norm": 0.1513659507036209, "learning_rate": 0.001, "loss": 1.7795, "step": 1095700 }, { "epoch": 64.0931157513014, "grad_norm": 0.16597814857959747, "learning_rate": 0.001, "loss": 1.7767, "step": 1095800 }, { "epoch": 64.0989647306545, "grad_norm": 0.2030152529478073, "learning_rate": 0.001, "loss": 1.784, "step": 1095900 }, { "epoch": 64.1048137100076, "grad_norm": 0.17447450757026672, "learning_rate": 0.001, "loss": 1.7754, "step": 1096000 }, { "epoch": 64.11066268936071, "grad_norm": 0.16330286860466003, "learning_rate": 0.001, "loss": 1.7718, "step": 1096100 }, { "epoch": 64.11651166871381, "grad_norm": 0.14432969689369202, "learning_rate": 0.001, "loss": 1.7736, "step": 1096200 }, { "epoch": 64.12236064806692, "grad_norm": 0.15861742198467255, "learning_rate": 0.001, "loss": 1.7775, "step": 1096300 }, { "epoch": 64.12820962742002, "grad_norm": 0.20143041014671326, "learning_rate": 0.001, "loss": 1.7762, "step": 1096400 }, { "epoch": 64.13405860677312, "grad_norm": 0.2072136104106903, "learning_rate": 0.001, "loss": 1.7823, "step": 1096500 }, { "epoch": 64.13990758612623, "grad_norm": 0.19487915933132172, "learning_rate": 0.001, "loss": 1.7775, "step": 1096600 }, { "epoch": 64.14575656547932, "grad_norm": 0.14030896127223969, "learning_rate": 0.001, "loss": 1.7828, "step": 1096700 }, { "epoch": 64.15160554483242, "grad_norm": 0.1735881268978119, "learning_rate": 0.001, "loss": 1.782, "step": 1096800 }, { "epoch": 64.15745452418552, "grad_norm": 0.17590123414993286, "learning_rate": 0.001, "loss": 1.7798, "step": 1096900 }, { "epoch": 64.16330350353863, "grad_norm": 0.16790813207626343, "learning_rate": 0.001, "loss": 1.777, "step": 1097000 }, { "epoch": 64.16915248289173, "grad_norm": 0.20667320489883423, "learning_rate": 0.001, "loss": 1.7734, "step": 1097100 }, { "epoch": 64.17500146224484, "grad_norm": 0.24989725649356842, "learning_rate": 0.001, "loss": 1.7783, "step": 1097200 }, { "epoch": 64.18085044159794, "grad_norm": 0.1933876872062683, "learning_rate": 0.001, "loss": 1.7801, "step": 1097300 }, { "epoch": 64.18669942095104, "grad_norm": 0.252401739358902, "learning_rate": 0.001, "loss": 1.7799, "step": 1097400 }, { "epoch": 64.19254840030415, "grad_norm": 0.19278432428836823, "learning_rate": 0.001, "loss": 1.7766, "step": 1097500 }, { "epoch": 64.19839737965725, "grad_norm": 0.17553259432315826, "learning_rate": 0.001, "loss": 1.779, "step": 1097600 }, { "epoch": 64.20424635901036, "grad_norm": 0.19899515807628632, "learning_rate": 0.001, "loss": 1.7808, "step": 1097700 }, { "epoch": 64.21009533836346, "grad_norm": 0.14558587968349457, "learning_rate": 0.001, "loss": 1.7792, "step": 1097800 }, { "epoch": 64.21594431771656, "grad_norm": 0.21356715261936188, "learning_rate": 0.001, "loss": 1.7829, "step": 1097900 }, { "epoch": 64.22179329706967, "grad_norm": 0.18620198965072632, "learning_rate": 0.001, "loss": 1.7729, "step": 1098000 }, { "epoch": 64.22764227642277, "grad_norm": 0.1912640929222107, "learning_rate": 0.001, "loss": 1.7774, "step": 1098100 }, { "epoch": 64.23349125577587, "grad_norm": 0.13809698820114136, "learning_rate": 0.001, "loss": 1.7802, "step": 1098200 }, { "epoch": 64.23934023512896, "grad_norm": 0.2058824896812439, "learning_rate": 0.001, "loss": 1.7803, "step": 1098300 }, { "epoch": 64.24518921448207, "grad_norm": 0.173777237534523, "learning_rate": 0.001, "loss": 1.7758, "step": 1098400 }, { "epoch": 64.25103819383517, "grad_norm": 0.20084118843078613, "learning_rate": 0.001, "loss": 1.7757, "step": 1098500 }, { "epoch": 64.25688717318828, "grad_norm": 0.15839624404907227, "learning_rate": 0.001, "loss": 1.7778, "step": 1098600 }, { "epoch": 64.26273615254138, "grad_norm": 0.15381887555122375, "learning_rate": 0.001, "loss": 1.7828, "step": 1098700 }, { "epoch": 64.26858513189448, "grad_norm": 0.17819470167160034, "learning_rate": 0.001, "loss": 1.7811, "step": 1098800 }, { "epoch": 64.27443411124759, "grad_norm": 0.24150672554969788, "learning_rate": 0.001, "loss": 1.7797, "step": 1098900 }, { "epoch": 64.28028309060069, "grad_norm": 0.22177810966968536, "learning_rate": 0.001, "loss": 1.7821, "step": 1099000 }, { "epoch": 64.2861320699538, "grad_norm": 0.18127377331256866, "learning_rate": 0.001, "loss": 1.7776, "step": 1099100 }, { "epoch": 64.2919810493069, "grad_norm": 0.16763119399547577, "learning_rate": 0.001, "loss": 1.7768, "step": 1099200 }, { "epoch": 64.29783002866, "grad_norm": 0.2220841497182846, "learning_rate": 0.001, "loss": 1.7762, "step": 1099300 }, { "epoch": 64.3036790080131, "grad_norm": 0.15465319156646729, "learning_rate": 0.001, "loss": 1.7824, "step": 1099400 }, { "epoch": 64.30952798736621, "grad_norm": 0.22340817749500275, "learning_rate": 0.001, "loss": 1.7837, "step": 1099500 }, { "epoch": 64.31537696671931, "grad_norm": 0.19109594821929932, "learning_rate": 0.001, "loss": 1.7817, "step": 1099600 }, { "epoch": 64.32122594607242, "grad_norm": 0.2519729733467102, "learning_rate": 0.001, "loss": 1.7779, "step": 1099700 }, { "epoch": 64.3270749254255, "grad_norm": 0.1680735945701599, "learning_rate": 0.001, "loss": 1.778, "step": 1099800 }, { "epoch": 64.33292390477861, "grad_norm": 0.14346186816692352, "learning_rate": 0.001, "loss": 1.7785, "step": 1099900 }, { "epoch": 64.33877288413171, "grad_norm": 0.14032799005508423, "learning_rate": 0.001, "loss": 1.7865, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_ag_news_accuracy": 0.245875, "eval_ag_news_bleu_score": 7.910122330941473, "eval_ag_news_bleu_score_sem": 0.6710190398051893, "eval_ag_news_emb_cos_sim": 0.7164015769958496, "eval_ag_news_emb_cos_sim_sem": 0.012850501574575901, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.677503824234009, "eval_ag_news_n_ngrams_match_1": 14.28125, "eval_ag_news_n_ngrams_match_2": 4.484375, "eval_ag_news_n_ngrams_match_3": 1.8515625, "eval_ag_news_num_pred_words": 46.2421875, "eval_ag_news_num_true_words": 44.6640625, "eval_ag_news_perplexity": 14.548731740747673, "eval_ag_news_pred_num_tokens": 71.140625, "eval_ag_news_rouge_score": 0.3036263333730098, "eval_ag_news_runtime": 39.3143, "eval_ag_news_samples_per_second": 12.718, "eval_ag_news_steps_per_second": 0.025, "eval_ag_news_token_set_f1": 0.33890577756750156, "eval_ag_news_token_set_f1_sem": 0.011026789832644907, "eval_ag_news_token_set_precision": 0.3239975263405971, "eval_ag_news_token_set_recall": 0.3659231316563966, "eval_ag_news_true_num_tokens": 61.7421875, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_anthropic_toxic_prompts_accuracy": 0.10246875, "eval_anthropic_toxic_prompts_bleu_score": 38.52356761000085, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.583572901904783, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.8823449611663818, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009559271857142448, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.125, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02934655810211727, "eval_anthropic_toxic_prompts_loss": 1.239019513130188, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.6640625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.6171875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.4375, "eval_anthropic_toxic_prompts_num_pred_words": 16.2109375, "eval_anthropic_toxic_prompts_num_true_words": 15.8125, "eval_anthropic_toxic_prompts_perplexity": 3.4522269416253697, "eval_anthropic_toxic_prompts_pred_num_tokens": 20.6875, "eval_anthropic_toxic_prompts_rouge_score": 0.6728376720057064, "eval_anthropic_toxic_prompts_runtime": 34.8892, "eval_anthropic_toxic_prompts_samples_per_second": 14.331, "eval_anthropic_toxic_prompts_steps_per_second": 0.029, "eval_anthropic_toxic_prompts_token_set_f1": 0.6941547889197031, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.01745658560411224, "eval_anthropic_toxic_prompts_token_set_precision": 0.692230545371044, "eval_anthropic_toxic_prompts_token_set_recall": 0.7016968699521757, "eval_anthropic_toxic_prompts_true_num_tokens": 19.453125, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_arxiv_accuracy": 0.379671875, "eval_arxiv_bleu_score": 1.8964770273758689, "eval_arxiv_bleu_score_sem": 0.1668771037401173, "eval_arxiv_emb_cos_sim": 0.46661901473999023, "eval_arxiv_emb_cos_sim_sem": 0.018813591450452805, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4134604930877686, "eval_arxiv_n_ngrams_match_1": 13.6796875, "eval_arxiv_n_ngrams_match_2": 2.5703125, "eval_arxiv_n_ngrams_match_3": 0.578125, "eval_arxiv_num_pred_words": 57.484375, "eval_arxiv_num_true_words": 85.90625, "eval_arxiv_perplexity": 30.370158350507296, "eval_arxiv_pred_num_tokens": 125.703125, "eval_arxiv_rouge_score": 0.17808865053254758, "eval_arxiv_runtime": 34.9946, "eval_arxiv_samples_per_second": 14.288, "eval_arxiv_steps_per_second": 0.029, "eval_arxiv_token_set_f1": 0.182760079981563, "eval_arxiv_token_set_f1_sem": 0.008373365202581393, "eval_arxiv_token_set_precision": 0.12617523700750902, "eval_arxiv_token_set_recall": 0.4148749229623302, "eval_arxiv_true_num_tokens": 125.609375, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_python_code_alpaca_accuracy": 0.131671875, "eval_python_code_alpaca_bleu_score": 29.175479062475013, "eval_python_code_alpaca_bleu_score_sem": 1.701749304549614, "eval_python_code_alpaca_emb_cos_sim": 0.8636412620544434, "eval_python_code_alpaca_emb_cos_sim_sem": 0.010359623469412327, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0078125, "eval_python_code_alpaca_exact_match_sem": 0.0078125, "eval_python_code_alpaca_loss": 1.4707430601119995, "eval_python_code_alpaca_n_ngrams_match_1": 11.1953125, "eval_python_code_alpaca_n_ngrams_match_2": 6.1171875, "eval_python_code_alpaca_n_ngrams_match_3": 3.65625, "eval_python_code_alpaca_num_pred_words": 18.9375, "eval_python_code_alpaca_num_true_words": 19.1171875, "eval_python_code_alpaca_perplexity": 4.352468085200935, "eval_python_code_alpaca_pred_num_tokens": 24.71875, "eval_python_code_alpaca_rouge_score": 0.6035878165741559, "eval_python_code_alpaca_runtime": 35.0946, "eval_python_code_alpaca_samples_per_second": 14.247, "eval_python_code_alpaca_steps_per_second": 0.028, "eval_python_code_alpaca_token_set_f1": 0.6270943028206054, "eval_python_code_alpaca_token_set_f1_sem": 0.013621923734523023, "eval_python_code_alpaca_token_set_precision": 0.6188153221369453, "eval_python_code_alpaca_token_set_recall": 0.6403693947269352, "eval_python_code_alpaca_true_num_tokens": 24.6328125, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_wikibio_accuracy": 0.372375, "eval_wikibio_bleu_score": 7.298589247889197, "eval_wikibio_bleu_score_sem": 0.6813419005969312, "eval_wikibio_emb_cos_sim": 0.5965741872787476, "eval_wikibio_emb_cos_sim_sem": 0.02265981025993824, "eval_wikibio_emb_top1_equal": 0.9140625, "eval_wikibio_emb_top1_equal_sem": 0.024870097637176514, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.7013044357299805, "eval_wikibio_n_ngrams_match_1": 14.328125, "eval_wikibio_n_ngrams_match_2": 4.8359375, "eval_wikibio_n_ngrams_match_3": 2.078125, "eval_wikibio_num_pred_words": 51.2734375, "eval_wikibio_num_true_words": 52.3984375, "eval_wikibio_perplexity": 14.899154043416408, "eval_wikibio_pred_num_tokens": 107.015625, "eval_wikibio_rouge_score": 0.2832104945574394, "eval_wikibio_runtime": 37.1294, "eval_wikibio_samples_per_second": 13.466, "eval_wikibio_steps_per_second": 0.027, "eval_wikibio_token_set_f1": 0.30502627624116585, "eval_wikibio_token_set_f1_sem": 0.012643413622282502, "eval_wikibio_token_set_precision": 0.2723066116953017, "eval_wikibio_token_set_recall": 0.39431376272069274, "eval_wikibio_true_num_tokens": 101.2734375, "step": 1100000 }, { "epoch": 64.33877288413171, "eval_msmarco_accuracy": 0.3958125, "eval_msmarco_bleu_score": 17.77064898659385, "eval_msmarco_bleu_score_sem": 1.5230552371857058, "eval_msmarco_emb_cos_sim": 0.793936014175415, "eval_msmarco_emb_cos_sim_sem": 0.017073174938559532, "eval_msmarco_emb_top1_equal": 0.953125, "eval_msmarco_emb_top1_equal_sem": 0.01875615119934082, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.703170657157898, "eval_msmarco_n_ngrams_match_1": 27.6875, "eval_msmarco_n_ngrams_match_2": 12.546875, "eval_msmarco_n_ngrams_match_3": 7.2421875, "eval_msmarco_num_pred_words": 60.859375, "eval_msmarco_num_true_words": 61.5390625, "eval_msmarco_perplexity": 5.491330946289421, "eval_msmarco_pred_num_tokens": 85.125, "eval_msmarco_rouge_score": 0.43803314458339115, "eval_msmarco_runtime": 30.0388, "eval_msmarco_samples_per_second": 16.645, "eval_msmarco_steps_per_second": 0.033, "eval_msmarco_token_set_f1": 0.4652949775906337, "eval_msmarco_token_set_f1_sem": 0.01502119455087476, "eval_msmarco_token_set_precision": 0.4309371996639407, "eval_msmarco_token_set_recall": 0.5310305958658251, "eval_msmarco_true_num_tokens": 80.6875, "step": 1100000 }, { "epoch": 64.34462186348482, "grad_norm": 0.19828230142593384, "learning_rate": 0.001, "loss": 1.783, "step": 1100100 }, { "epoch": 64.35047084283792, "grad_norm": 0.26229411363601685, "learning_rate": 0.001, "loss": 1.7854, "step": 1100200 }, { "epoch": 64.35631982219103, "grad_norm": 0.15230515599250793, "learning_rate": 0.001, "loss": 1.7801, "step": 1100300 }, { "epoch": 64.36216880154413, "grad_norm": 0.16343995928764343, "learning_rate": 0.001, "loss": 1.7824, "step": 1100400 }, { "epoch": 64.36801778089723, "grad_norm": 0.15332069993019104, "learning_rate": 0.001, "loss": 1.7821, "step": 1100500 }, { "epoch": 64.37386676025034, "grad_norm": 0.19404929876327515, "learning_rate": 0.001, "loss": 1.7834, "step": 1100600 }, { "epoch": 64.37971573960344, "grad_norm": 0.1421777307987213, "learning_rate": 0.001, "loss": 1.7819, "step": 1100700 }, { "epoch": 64.38556471895654, "grad_norm": 0.19944354891777039, "learning_rate": 0.001, "loss": 1.78, "step": 1100800 }, { "epoch": 64.39141369830965, "grad_norm": 0.19762179255485535, "learning_rate": 0.001, "loss": 1.7828, "step": 1100900 }, { "epoch": 64.39726267766275, "grad_norm": 0.17945311963558197, "learning_rate": 0.001, "loss": 1.7775, "step": 1101000 }, { "epoch": 64.40311165701586, "grad_norm": 0.16798758506774902, "learning_rate": 0.001, "loss": 1.7835, "step": 1101100 }, { "epoch": 64.40896063636896, "grad_norm": 0.17920620739459991, "learning_rate": 0.001, "loss": 1.783, "step": 1101200 }, { "epoch": 64.41480961572206, "grad_norm": 0.16679143905639648, "learning_rate": 0.001, "loss": 1.7853, "step": 1101300 }, { "epoch": 64.42065859507515, "grad_norm": 0.15093645453453064, "learning_rate": 0.001, "loss": 1.7748, "step": 1101400 }, { "epoch": 64.42650757442826, "grad_norm": 0.1685737669467926, "learning_rate": 0.001, "loss": 1.7848, "step": 1101500 }, { "epoch": 64.43235655378136, "grad_norm": 0.22817650437355042, "learning_rate": 0.001, "loss": 1.7864, "step": 1101600 }, { "epoch": 64.43820553313446, "grad_norm": 0.18726252019405365, "learning_rate": 0.001, "loss": 1.7836, "step": 1101700 }, { "epoch": 64.44405451248757, "grad_norm": 0.14815759658813477, "learning_rate": 0.001, "loss": 1.7789, "step": 1101800 }, { "epoch": 64.44990349184067, "grad_norm": 0.1565937101840973, "learning_rate": 0.001, "loss": 1.7847, "step": 1101900 }, { "epoch": 64.45575247119378, "grad_norm": 0.24798992276191711, "learning_rate": 0.001, "loss": 1.7775, "step": 1102000 }, { "epoch": 64.46160145054688, "grad_norm": 0.19258032739162445, "learning_rate": 0.001, "loss": 1.7896, "step": 1102100 }, { "epoch": 64.46745042989998, "grad_norm": 0.14790531992912292, "learning_rate": 0.001, "loss": 1.7878, "step": 1102200 }, { "epoch": 64.47329940925309, "grad_norm": 0.13770680129528046, "learning_rate": 0.001, "loss": 1.7831, "step": 1102300 }, { "epoch": 64.47914838860619, "grad_norm": 0.25133979320526123, "learning_rate": 0.001, "loss": 1.7778, "step": 1102400 }, { "epoch": 64.4849973679593, "grad_norm": 0.1554495096206665, "learning_rate": 0.001, "loss": 1.7813, "step": 1102500 }, { "epoch": 64.4908463473124, "grad_norm": 0.18511125445365906, "learning_rate": 0.001, "loss": 1.7833, "step": 1102600 }, { "epoch": 64.4966953266655, "grad_norm": 0.2368878722190857, "learning_rate": 0.001, "loss": 1.7841, "step": 1102700 }, { "epoch": 64.5025443060186, "grad_norm": 0.14684225618839264, "learning_rate": 0.001, "loss": 1.7868, "step": 1102800 }, { "epoch": 64.5083932853717, "grad_norm": 0.17882896959781647, "learning_rate": 0.001, "loss": 1.7833, "step": 1102900 }, { "epoch": 64.5142422647248, "grad_norm": 0.16792726516723633, "learning_rate": 0.001, "loss": 1.7821, "step": 1103000 }, { "epoch": 64.5200912440779, "grad_norm": 0.16530276834964752, "learning_rate": 0.001, "loss": 1.7834, "step": 1103100 }, { "epoch": 64.52594022343101, "grad_norm": 0.14938868582248688, "learning_rate": 0.001, "loss": 1.7826, "step": 1103200 }, { "epoch": 64.53178920278411, "grad_norm": 0.19389161467552185, "learning_rate": 0.001, "loss": 1.7785, "step": 1103300 }, { "epoch": 64.53763818213721, "grad_norm": 0.15175439417362213, "learning_rate": 0.001, "loss": 1.7808, "step": 1103400 }, { "epoch": 64.54348716149032, "grad_norm": 0.1928180605173111, "learning_rate": 0.001, "loss": 1.7841, "step": 1103500 }, { "epoch": 64.54933614084342, "grad_norm": 0.20571660995483398, "learning_rate": 0.001, "loss": 1.7868, "step": 1103600 }, { "epoch": 64.55518512019653, "grad_norm": 0.20472456514835358, "learning_rate": 0.001, "loss": 1.785, "step": 1103700 }, { "epoch": 64.56103409954963, "grad_norm": 0.17968451976776123, "learning_rate": 0.001, "loss": 1.7833, "step": 1103800 }, { "epoch": 64.56688307890273, "grad_norm": 0.180664524435997, "learning_rate": 0.001, "loss": 1.7879, "step": 1103900 }, { "epoch": 64.57273205825584, "grad_norm": 0.19137173891067505, "learning_rate": 0.001, "loss": 1.7792, "step": 1104000 }, { "epoch": 64.57858103760894, "grad_norm": 0.188410222530365, "learning_rate": 0.001, "loss": 1.7863, "step": 1104100 }, { "epoch": 64.58443001696205, "grad_norm": 0.18977735936641693, "learning_rate": 0.001, "loss": 1.7797, "step": 1104200 }, { "epoch": 64.59027899631515, "grad_norm": 0.2168736606836319, "learning_rate": 0.001, "loss": 1.7845, "step": 1104300 }, { "epoch": 64.59612797566825, "grad_norm": 0.1665458381175995, "learning_rate": 0.001, "loss": 1.7783, "step": 1104400 }, { "epoch": 64.60197695502134, "grad_norm": 0.15825411677360535, "learning_rate": 0.001, "loss": 1.7796, "step": 1104500 }, { "epoch": 64.60782593437445, "grad_norm": 0.18986958265304565, "learning_rate": 0.001, "loss": 1.7879, "step": 1104600 }, { "epoch": 64.61367491372755, "grad_norm": 0.19355256855487823, "learning_rate": 0.001, "loss": 1.7885, "step": 1104700 }, { "epoch": 64.61952389308065, "grad_norm": 0.33224090933799744, "learning_rate": 0.001, "loss": 1.7868, "step": 1104800 }, { "epoch": 64.62537287243376, "grad_norm": 0.1983260214328766, "learning_rate": 0.001, "loss": 1.7858, "step": 1104900 }, { "epoch": 64.63122185178686, "grad_norm": 0.14165958762168884, "learning_rate": 0.001, "loss": 1.7802, "step": 1105000 }, { "epoch": 64.63707083113997, "grad_norm": 0.24656446278095245, "learning_rate": 0.001, "loss": 1.782, "step": 1105100 }, { "epoch": 64.64291981049307, "grad_norm": 0.16143305599689484, "learning_rate": 0.001, "loss": 1.7817, "step": 1105200 }, { "epoch": 64.64876878984617, "grad_norm": 0.16870248317718506, "learning_rate": 0.001, "loss": 1.7825, "step": 1105300 }, { "epoch": 64.65461776919928, "grad_norm": 0.16689175367355347, "learning_rate": 0.001, "loss": 1.786, "step": 1105400 }, { "epoch": 64.66046674855238, "grad_norm": 0.21650634706020355, "learning_rate": 0.001, "loss": 1.787, "step": 1105500 }, { "epoch": 64.66631572790548, "grad_norm": 0.1594105213880539, "learning_rate": 0.001, "loss": 1.7853, "step": 1105600 }, { "epoch": 64.67216470725859, "grad_norm": 0.17373257875442505, "learning_rate": 0.001, "loss": 1.7808, "step": 1105700 }, { "epoch": 64.67801368661169, "grad_norm": 0.2212027758359909, "learning_rate": 0.001, "loss": 1.7807, "step": 1105800 }, { "epoch": 64.6838626659648, "grad_norm": 0.23856142163276672, "learning_rate": 0.001, "loss": 1.7856, "step": 1105900 }, { "epoch": 64.68971164531789, "grad_norm": 0.2040112167596817, "learning_rate": 0.001, "loss": 1.785, "step": 1106000 }, { "epoch": 64.69556062467099, "grad_norm": 0.18996812403202057, "learning_rate": 0.001, "loss": 1.7844, "step": 1106100 }, { "epoch": 64.7014096040241, "grad_norm": 0.18972009420394897, "learning_rate": 0.001, "loss": 1.7886, "step": 1106200 }, { "epoch": 64.7072585833772, "grad_norm": 0.2092307209968567, "learning_rate": 0.001, "loss": 1.7936, "step": 1106300 }, { "epoch": 64.7131075627303, "grad_norm": 0.18330591917037964, "learning_rate": 0.001, "loss": 1.791, "step": 1106400 }, { "epoch": 64.7189565420834, "grad_norm": 0.2022649198770523, "learning_rate": 0.001, "loss": 1.7888, "step": 1106500 }, { "epoch": 64.72480552143651, "grad_norm": 0.14972300827503204, "learning_rate": 0.001, "loss": 1.7809, "step": 1106600 }, { "epoch": 64.73065450078961, "grad_norm": 0.2808481752872467, "learning_rate": 0.001, "loss": 1.78, "step": 1106700 }, { "epoch": 64.73650348014272, "grad_norm": 0.1456676423549652, "learning_rate": 0.001, "loss": 1.7862, "step": 1106800 }, { "epoch": 64.74235245949582, "grad_norm": 0.16072015464305878, "learning_rate": 0.001, "loss": 1.7863, "step": 1106900 }, { "epoch": 64.74820143884892, "grad_norm": 0.20811933279037476, "learning_rate": 0.001, "loss": 1.784, "step": 1107000 }, { "epoch": 64.75405041820203, "grad_norm": 0.1716119796037674, "learning_rate": 0.001, "loss": 1.7847, "step": 1107100 }, { "epoch": 64.75989939755513, "grad_norm": 0.16259650886058807, "learning_rate": 0.001, "loss": 1.7866, "step": 1107200 }, { "epoch": 64.76574837690823, "grad_norm": 0.23391248285770416, "learning_rate": 0.001, "loss": 1.7899, "step": 1107300 }, { "epoch": 64.77159735626134, "grad_norm": 0.29320669174194336, "learning_rate": 0.001, "loss": 1.7839, "step": 1107400 }, { "epoch": 64.77744633561444, "grad_norm": 0.1972796618938446, "learning_rate": 0.001, "loss": 1.7864, "step": 1107500 }, { "epoch": 64.78329531496753, "grad_norm": 0.1427360326051712, "learning_rate": 0.001, "loss": 1.7846, "step": 1107600 }, { "epoch": 64.78914429432064, "grad_norm": 0.16221944987773895, "learning_rate": 0.001, "loss": 1.7919, "step": 1107700 }, { "epoch": 64.79499327367374, "grad_norm": 0.20910756289958954, "learning_rate": 0.001, "loss": 1.7884, "step": 1107800 }, { "epoch": 64.80084225302684, "grad_norm": 0.17997246980667114, "learning_rate": 0.001, "loss": 1.7839, "step": 1107900 }, { "epoch": 64.80669123237995, "grad_norm": 0.20334893465042114, "learning_rate": 0.001, "loss": 1.7811, "step": 1108000 }, { "epoch": 64.81254021173305, "grad_norm": 0.17124296724796295, "learning_rate": 0.001, "loss": 1.7831, "step": 1108100 }, { "epoch": 64.81838919108615, "grad_norm": 0.21810424327850342, "learning_rate": 0.001, "loss": 1.7831, "step": 1108200 }, { "epoch": 64.82423817043926, "grad_norm": 0.2298131138086319, "learning_rate": 0.001, "loss": 1.7887, "step": 1108300 }, { "epoch": 64.83008714979236, "grad_norm": 0.17741850018501282, "learning_rate": 0.001, "loss": 1.7843, "step": 1108400 }, { "epoch": 64.83593612914547, "grad_norm": 0.16457471251487732, "learning_rate": 0.001, "loss": 1.7841, "step": 1108500 }, { "epoch": 64.84178510849857, "grad_norm": 0.16880063712596893, "learning_rate": 0.001, "loss": 1.7889, "step": 1108600 }, { "epoch": 64.84763408785167, "grad_norm": 0.3119845390319824, "learning_rate": 0.001, "loss": 1.7893, "step": 1108700 }, { "epoch": 64.85348306720478, "grad_norm": 0.1670072227716446, "learning_rate": 0.001, "loss": 1.7892, "step": 1108800 }, { "epoch": 64.85933204655788, "grad_norm": 0.23765911161899567, "learning_rate": 0.001, "loss": 1.7898, "step": 1108900 }, { "epoch": 64.86518102591099, "grad_norm": 0.2272341251373291, "learning_rate": 0.001, "loss": 1.7852, "step": 1109000 }, { "epoch": 64.87103000526407, "grad_norm": 0.1965077519416809, "learning_rate": 0.001, "loss": 1.7877, "step": 1109100 }, { "epoch": 64.87687898461718, "grad_norm": 0.16759084165096283, "learning_rate": 0.001, "loss": 1.7839, "step": 1109200 }, { "epoch": 64.88272796397028, "grad_norm": 0.15908436477184296, "learning_rate": 0.001, "loss": 1.7842, "step": 1109300 }, { "epoch": 64.88857694332339, "grad_norm": 0.19506779313087463, "learning_rate": 0.001, "loss": 1.7834, "step": 1109400 }, { "epoch": 64.89442592267649, "grad_norm": 0.19428414106369019, "learning_rate": 0.001, "loss": 1.7923, "step": 1109500 }, { "epoch": 64.9002749020296, "grad_norm": 0.1770133674144745, "learning_rate": 0.001, "loss": 1.7877, "step": 1109600 }, { "epoch": 64.9061238813827, "grad_norm": 0.17412221431732178, "learning_rate": 0.001, "loss": 1.7828, "step": 1109700 }, { "epoch": 64.9119728607358, "grad_norm": 0.24408948421478271, "learning_rate": 0.001, "loss": 1.788, "step": 1109800 }, { "epoch": 64.9178218400889, "grad_norm": 0.17226047813892365, "learning_rate": 0.001, "loss": 1.7868, "step": 1109900 }, { "epoch": 64.92367081944201, "grad_norm": 0.15190674364566803, "learning_rate": 0.001, "loss": 1.7865, "step": 1110000 }, { "epoch": 64.92951979879511, "grad_norm": 0.18138214945793152, "learning_rate": 0.001, "loss": 1.7878, "step": 1110100 }, { "epoch": 64.93536877814822, "grad_norm": 0.20996801555156708, "learning_rate": 0.001, "loss": 1.7875, "step": 1110200 }, { "epoch": 64.94121775750132, "grad_norm": 0.1987593173980713, "learning_rate": 0.001, "loss": 1.792, "step": 1110300 }, { "epoch": 64.94706673685442, "grad_norm": 0.17443262040615082, "learning_rate": 0.001, "loss": 1.7905, "step": 1110400 }, { "epoch": 64.95291571620753, "grad_norm": 0.14958734810352325, "learning_rate": 0.001, "loss": 1.7835, "step": 1110500 }, { "epoch": 64.95876469556063, "grad_norm": 0.15682166814804077, "learning_rate": 0.001, "loss": 1.7879, "step": 1110600 }, { "epoch": 64.96461367491372, "grad_norm": 0.16787898540496826, "learning_rate": 0.001, "loss": 1.7908, "step": 1110700 }, { "epoch": 64.97046265426683, "grad_norm": 0.19441920518875122, "learning_rate": 0.001, "loss": 1.7879, "step": 1110800 }, { "epoch": 64.97631163361993, "grad_norm": 0.1749098002910614, "learning_rate": 0.001, "loss": 1.7863, "step": 1110900 }, { "epoch": 64.98216061297303, "grad_norm": 0.20711462199687958, "learning_rate": 0.001, "loss": 1.7817, "step": 1111000 }, { "epoch": 64.98800959232614, "grad_norm": 0.21018019318580627, "learning_rate": 0.001, "loss": 1.7903, "step": 1111100 }, { "epoch": 64.99385857167924, "grad_norm": 0.1790655553340912, "learning_rate": 0.001, "loss": 1.7881, "step": 1111200 }, { "epoch": 64.99970755103234, "grad_norm": 0.16025441884994507, "learning_rate": 0.001, "loss": 1.7859, "step": 1111300 }, { "epoch": 65.00555653038545, "grad_norm": 0.2811003029346466, "learning_rate": 0.001, "loss": 1.7795, "step": 1111400 }, { "epoch": 65.01140550973855, "grad_norm": 0.1858116239309311, "learning_rate": 0.001, "loss": 1.7725, "step": 1111500 }, { "epoch": 65.01725448909166, "grad_norm": 0.1849815547466278, "learning_rate": 0.001, "loss": 1.771, "step": 1111600 }, { "epoch": 65.02310346844476, "grad_norm": 0.1715686023235321, "learning_rate": 0.001, "loss": 1.7741, "step": 1111700 }, { "epoch": 65.02895244779786, "grad_norm": 0.20481759309768677, "learning_rate": 0.001, "loss": 1.7799, "step": 1111800 }, { "epoch": 65.03480142715097, "grad_norm": 0.2085033804178238, "learning_rate": 0.001, "loss": 1.7739, "step": 1111900 }, { "epoch": 65.04065040650407, "grad_norm": 0.14969591796398163, "learning_rate": 0.001, "loss": 1.7698, "step": 1112000 }, { "epoch": 65.04649938585717, "grad_norm": 0.17013803124427795, "learning_rate": 0.001, "loss": 1.7728, "step": 1112100 }, { "epoch": 65.05234836521026, "grad_norm": 0.1974477618932724, "learning_rate": 0.001, "loss": 1.7692, "step": 1112200 }, { "epoch": 65.05819734456337, "grad_norm": 0.21256422996520996, "learning_rate": 0.001, "loss": 1.7751, "step": 1112300 }, { "epoch": 65.06404632391647, "grad_norm": 0.25140050053596497, "learning_rate": 0.001, "loss": 1.7824, "step": 1112400 }, { "epoch": 65.06989530326958, "grad_norm": 0.2531537413597107, "learning_rate": 0.001, "loss": 1.7812, "step": 1112500 }, { "epoch": 65.07574428262268, "grad_norm": 0.21962065994739532, "learning_rate": 0.001, "loss": 1.7698, "step": 1112600 }, { "epoch": 65.08159326197578, "grad_norm": 0.15313349664211273, "learning_rate": 0.001, "loss": 1.7776, "step": 1112700 }, { "epoch": 65.08744224132889, "grad_norm": 0.19441990554332733, "learning_rate": 0.001, "loss": 1.7804, "step": 1112800 }, { "epoch": 65.09329122068199, "grad_norm": 0.16973398625850677, "learning_rate": 0.001, "loss": 1.7785, "step": 1112900 }, { "epoch": 65.0991402000351, "grad_norm": 0.1596592217683792, "learning_rate": 0.001, "loss": 1.7754, "step": 1113000 }, { "epoch": 65.1049891793882, "grad_norm": 0.15372994542121887, "learning_rate": 0.001, "loss": 1.7736, "step": 1113100 }, { "epoch": 65.1108381587413, "grad_norm": 0.15667594969272614, "learning_rate": 0.001, "loss": 1.7697, "step": 1113200 }, { "epoch": 65.1166871380944, "grad_norm": 0.22295746207237244, "learning_rate": 0.001, "loss": 1.7749, "step": 1113300 }, { "epoch": 65.12253611744751, "grad_norm": 0.17880377173423767, "learning_rate": 0.001, "loss": 1.7781, "step": 1113400 }, { "epoch": 65.12838509680061, "grad_norm": 0.14929254353046417, "learning_rate": 0.001, "loss": 1.7767, "step": 1113500 }, { "epoch": 65.13423407615372, "grad_norm": 0.1872067004442215, "learning_rate": 0.001, "loss": 1.7773, "step": 1113600 }, { "epoch": 65.14008305550682, "grad_norm": 0.21811337769031525, "learning_rate": 0.001, "loss": 1.7779, "step": 1113700 }, { "epoch": 65.14593203485991, "grad_norm": 0.17563465237617493, "learning_rate": 0.001, "loss": 1.7757, "step": 1113800 }, { "epoch": 65.15178101421301, "grad_norm": 0.1541358232498169, "learning_rate": 0.001, "loss": 1.778, "step": 1113900 }, { "epoch": 65.15762999356612, "grad_norm": 0.16128572821617126, "learning_rate": 0.001, "loss": 1.7803, "step": 1114000 }, { "epoch": 65.16347897291922, "grad_norm": 0.20588169991970062, "learning_rate": 0.001, "loss": 1.7796, "step": 1114100 }, { "epoch": 65.16932795227233, "grad_norm": 0.18362215161323547, "learning_rate": 0.001, "loss": 1.7753, "step": 1114200 }, { "epoch": 65.17517693162543, "grad_norm": 0.1557137370109558, "learning_rate": 0.001, "loss": 1.7727, "step": 1114300 }, { "epoch": 65.18102591097853, "grad_norm": 0.19468462467193604, "learning_rate": 0.001, "loss": 1.7765, "step": 1114400 }, { "epoch": 65.18687489033164, "grad_norm": 0.18676520884037018, "learning_rate": 0.001, "loss": 1.7848, "step": 1114500 }, { "epoch": 65.19272386968474, "grad_norm": 0.16563303768634796, "learning_rate": 0.001, "loss": 1.7791, "step": 1114600 }, { "epoch": 65.19857284903784, "grad_norm": 0.18986301124095917, "learning_rate": 0.001, "loss": 1.7772, "step": 1114700 }, { "epoch": 65.20442182839095, "grad_norm": 0.15648716688156128, "learning_rate": 0.001, "loss": 1.7766, "step": 1114800 }, { "epoch": 65.21027080774405, "grad_norm": 0.15404729545116425, "learning_rate": 0.001, "loss": 1.7805, "step": 1114900 }, { "epoch": 65.21611978709716, "grad_norm": 0.2691963315010071, "learning_rate": 0.001, "loss": 1.7754, "step": 1115000 }, { "epoch": 65.22196876645026, "grad_norm": 0.20294180512428284, "learning_rate": 0.001, "loss": 1.7768, "step": 1115100 }, { "epoch": 65.22781774580336, "grad_norm": 0.14433029294013977, "learning_rate": 0.001, "loss": 1.7754, "step": 1115200 }, { "epoch": 65.23366672515645, "grad_norm": 0.2284480482339859, "learning_rate": 0.001, "loss": 1.7812, "step": 1115300 }, { "epoch": 65.23951570450956, "grad_norm": 0.20042139291763306, "learning_rate": 0.001, "loss": 1.7771, "step": 1115400 }, { "epoch": 65.24536468386266, "grad_norm": 0.2048271894454956, "learning_rate": 0.001, "loss": 1.7776, "step": 1115500 }, { "epoch": 65.25121366321576, "grad_norm": 0.17792841792106628, "learning_rate": 0.001, "loss": 1.7832, "step": 1115600 }, { "epoch": 65.25706264256887, "grad_norm": 0.171097531914711, "learning_rate": 0.001, "loss": 1.7822, "step": 1115700 }, { "epoch": 65.26291162192197, "grad_norm": 0.20384041965007782, "learning_rate": 0.001, "loss": 1.7818, "step": 1115800 }, { "epoch": 65.26876060127508, "grad_norm": 0.19631634652614594, "learning_rate": 0.001, "loss": 1.7848, "step": 1115900 }, { "epoch": 65.27460958062818, "grad_norm": 0.22134748101234436, "learning_rate": 0.001, "loss": 1.7822, "step": 1116000 }, { "epoch": 65.28045855998128, "grad_norm": 0.15388265252113342, "learning_rate": 0.001, "loss": 1.7815, "step": 1116100 }, { "epoch": 65.28630753933439, "grad_norm": 0.207337886095047, "learning_rate": 0.001, "loss": 1.78, "step": 1116200 }, { "epoch": 65.29215651868749, "grad_norm": 0.15173032879829407, "learning_rate": 0.001, "loss": 1.7775, "step": 1116300 }, { "epoch": 65.2980054980406, "grad_norm": 0.20376233756542206, "learning_rate": 0.001, "loss": 1.7763, "step": 1116400 }, { "epoch": 65.3038544773937, "grad_norm": 0.1676371693611145, "learning_rate": 0.001, "loss": 1.7753, "step": 1116500 }, { "epoch": 65.3097034567468, "grad_norm": 0.18088486790657043, "learning_rate": 0.001, "loss": 1.7832, "step": 1116600 }, { "epoch": 65.3155524360999, "grad_norm": 0.1830408275127411, "learning_rate": 0.001, "loss": 1.7771, "step": 1116700 }, { "epoch": 65.32140141545301, "grad_norm": 0.14994539320468903, "learning_rate": 0.001, "loss": 1.7756, "step": 1116800 }, { "epoch": 65.3272503948061, "grad_norm": 0.1989208161830902, "learning_rate": 0.001, "loss": 1.7802, "step": 1116900 }, { "epoch": 65.3330993741592, "grad_norm": 0.16725996136665344, "learning_rate": 0.001, "loss": 1.7783, "step": 1117000 }, { "epoch": 65.33894835351231, "grad_norm": 0.1662929803133011, "learning_rate": 0.001, "loss": 1.774, "step": 1117100 }, { "epoch": 65.34479733286541, "grad_norm": 0.18573109805583954, "learning_rate": 0.001, "loss": 1.7801, "step": 1117200 }, { "epoch": 65.35064631221852, "grad_norm": 0.23094713687896729, "learning_rate": 0.001, "loss": 1.7815, "step": 1117300 }, { "epoch": 65.35649529157162, "grad_norm": 0.21657274663448334, "learning_rate": 0.001, "loss": 1.772, "step": 1117400 }, { "epoch": 65.36234427092472, "grad_norm": 0.14628978073596954, "learning_rate": 0.001, "loss": 1.7738, "step": 1117500 }, { "epoch": 65.36819325027783, "grad_norm": 0.16470633447170258, "learning_rate": 0.001, "loss": 1.7848, "step": 1117600 }, { "epoch": 65.37404222963093, "grad_norm": 0.21005690097808838, "learning_rate": 0.001, "loss": 1.7792, "step": 1117700 }, { "epoch": 65.37989120898403, "grad_norm": 0.1497117131948471, "learning_rate": 0.001, "loss": 1.7769, "step": 1117800 }, { "epoch": 65.38574018833714, "grad_norm": 0.2085028439760208, "learning_rate": 0.001, "loss": 1.7797, "step": 1117900 }, { "epoch": 65.39158916769024, "grad_norm": 0.1802169382572174, "learning_rate": 0.001, "loss": 1.7843, "step": 1118000 }, { "epoch": 65.39743814704335, "grad_norm": 0.1732330471277237, "learning_rate": 0.001, "loss": 1.7762, "step": 1118100 }, { "epoch": 65.40328712639645, "grad_norm": 0.1912691444158554, "learning_rate": 0.001, "loss": 1.7816, "step": 1118200 }, { "epoch": 65.40913610574955, "grad_norm": 0.16341319680213928, "learning_rate": 0.001, "loss": 1.7792, "step": 1118300 }, { "epoch": 65.41498508510264, "grad_norm": 0.14467157423496246, "learning_rate": 0.001, "loss": 1.7785, "step": 1118400 }, { "epoch": 65.42083406445575, "grad_norm": 0.14189685881137848, "learning_rate": 0.001, "loss": 1.7834, "step": 1118500 }, { "epoch": 65.42668304380885, "grad_norm": 0.1586247831583023, "learning_rate": 0.001, "loss": 1.7791, "step": 1118600 }, { "epoch": 65.43253202316195, "grad_norm": 0.1700916588306427, "learning_rate": 0.001, "loss": 1.7848, "step": 1118700 }, { "epoch": 65.43838100251506, "grad_norm": 0.1752864122390747, "learning_rate": 0.001, "loss": 1.7767, "step": 1118800 }, { "epoch": 65.44422998186816, "grad_norm": 0.19181646406650543, "learning_rate": 0.001, "loss": 1.7851, "step": 1118900 }, { "epoch": 65.45007896122127, "grad_norm": 0.1881876289844513, "learning_rate": 0.001, "loss": 1.7785, "step": 1119000 }, { "epoch": 65.45592794057437, "grad_norm": 0.21550258994102478, "learning_rate": 0.001, "loss": 1.7774, "step": 1119100 }, { "epoch": 65.46177691992747, "grad_norm": 0.17654000222682953, "learning_rate": 0.001, "loss": 1.7868, "step": 1119200 }, { "epoch": 65.46762589928058, "grad_norm": 0.27641135454177856, "learning_rate": 0.001, "loss": 1.7802, "step": 1119300 }, { "epoch": 65.47347487863368, "grad_norm": 0.18686620891094208, "learning_rate": 0.001, "loss": 1.7781, "step": 1119400 }, { "epoch": 65.47932385798678, "grad_norm": 0.1784030646085739, "learning_rate": 0.001, "loss": 1.783, "step": 1119500 }, { "epoch": 65.48517283733989, "grad_norm": 0.33989912271499634, "learning_rate": 0.001, "loss": 1.7828, "step": 1119600 }, { "epoch": 65.49102181669299, "grad_norm": 0.13608220219612122, "learning_rate": 0.001, "loss": 1.7812, "step": 1119700 }, { "epoch": 65.4968707960461, "grad_norm": 0.19204379618167877, "learning_rate": 0.001, "loss": 1.7775, "step": 1119800 }, { "epoch": 65.5027197753992, "grad_norm": 0.21455585956573486, "learning_rate": 0.001, "loss": 1.7843, "step": 1119900 }, { "epoch": 65.50856875475229, "grad_norm": 0.159841850399971, "learning_rate": 0.001, "loss": 1.7802, "step": 1120000 }, { "epoch": 65.5144177341054, "grad_norm": 0.15243497490882874, "learning_rate": 0.001, "loss": 1.7845, "step": 1120100 }, { "epoch": 65.5202667134585, "grad_norm": 0.17359189689159393, "learning_rate": 0.001, "loss": 1.7851, "step": 1120200 }, { "epoch": 65.5261156928116, "grad_norm": 0.2955806851387024, "learning_rate": 0.001, "loss": 1.7806, "step": 1120300 }, { "epoch": 65.5319646721647, "grad_norm": 0.1932179182767868, "learning_rate": 0.001, "loss": 1.7751, "step": 1120400 }, { "epoch": 65.53781365151781, "grad_norm": 0.1744602620601654, "learning_rate": 0.001, "loss": 1.7823, "step": 1120500 }, { "epoch": 65.54366263087091, "grad_norm": 0.15693148970603943, "learning_rate": 0.001, "loss": 1.7808, "step": 1120600 }, { "epoch": 65.54951161022402, "grad_norm": 0.16704773902893066, "learning_rate": 0.001, "loss": 1.7779, "step": 1120700 }, { "epoch": 65.55536058957712, "grad_norm": 0.1949620097875595, "learning_rate": 0.001, "loss": 1.786, "step": 1120800 }, { "epoch": 65.56120956893022, "grad_norm": 0.18617381155490875, "learning_rate": 0.001, "loss": 1.7834, "step": 1120900 }, { "epoch": 65.56705854828333, "grad_norm": 0.19428327679634094, "learning_rate": 0.001, "loss": 1.7792, "step": 1121000 }, { "epoch": 65.57290752763643, "grad_norm": 0.16410933434963226, "learning_rate": 0.001, "loss": 1.7762, "step": 1121100 }, { "epoch": 65.57875650698954, "grad_norm": 0.17427492141723633, "learning_rate": 0.001, "loss": 1.7845, "step": 1121200 }, { "epoch": 65.58460548634264, "grad_norm": 0.22092628479003906, "learning_rate": 0.001, "loss": 1.7785, "step": 1121300 }, { "epoch": 65.59045446569574, "grad_norm": 0.19207029044628143, "learning_rate": 0.001, "loss": 1.7815, "step": 1121400 }, { "epoch": 65.59630344504883, "grad_norm": 0.2809124290943146, "learning_rate": 0.001, "loss": 1.7919, "step": 1121500 }, { "epoch": 65.60215242440194, "grad_norm": 0.17992812395095825, "learning_rate": 0.001, "loss": 1.7853, "step": 1121600 }, { "epoch": 65.60800140375504, "grad_norm": 0.18636511266231537, "learning_rate": 0.001, "loss": 1.7782, "step": 1121700 }, { "epoch": 65.61385038310814, "grad_norm": 0.14819608628749847, "learning_rate": 0.001, "loss": 1.7831, "step": 1121800 }, { "epoch": 65.61969936246125, "grad_norm": 0.2538333237171173, "learning_rate": 0.001, "loss": 1.783, "step": 1121900 }, { "epoch": 65.62554834181435, "grad_norm": 0.22089222073554993, "learning_rate": 0.001, "loss": 1.783, "step": 1122000 }, { "epoch": 65.63139732116746, "grad_norm": 0.2164798378944397, "learning_rate": 0.001, "loss": 1.7791, "step": 1122100 }, { "epoch": 65.63724630052056, "grad_norm": 0.17404697835445404, "learning_rate": 0.001, "loss": 1.7809, "step": 1122200 }, { "epoch": 65.64309527987366, "grad_norm": 0.14839200675487518, "learning_rate": 0.001, "loss": 1.783, "step": 1122300 }, { "epoch": 65.64894425922677, "grad_norm": 0.15699896216392517, "learning_rate": 0.001, "loss": 1.7782, "step": 1122400 }, { "epoch": 65.65479323857987, "grad_norm": 0.19418373703956604, "learning_rate": 0.001, "loss": 1.7878, "step": 1122500 }, { "epoch": 65.66064221793297, "grad_norm": 0.21654841303825378, "learning_rate": 0.001, "loss": 1.7878, "step": 1122600 }, { "epoch": 65.66649119728608, "grad_norm": 0.1794600933790207, "learning_rate": 0.001, "loss": 1.7819, "step": 1122700 }, { "epoch": 65.67234017663918, "grad_norm": 0.17688651382923126, "learning_rate": 0.001, "loss": 1.7753, "step": 1122800 }, { "epoch": 65.67818915599229, "grad_norm": 0.1729412078857422, "learning_rate": 0.001, "loss": 1.7806, "step": 1122900 }, { "epoch": 65.68403813534539, "grad_norm": 0.1620074063539505, "learning_rate": 0.001, "loss": 1.7817, "step": 1123000 }, { "epoch": 65.68988711469848, "grad_norm": 0.1941128671169281, "learning_rate": 0.001, "loss": 1.7803, "step": 1123100 }, { "epoch": 65.69573609405158, "grad_norm": 0.1945410519838333, "learning_rate": 0.001, "loss": 1.7892, "step": 1123200 }, { "epoch": 65.70158507340469, "grad_norm": 0.14279261231422424, "learning_rate": 0.001, "loss": 1.7829, "step": 1123300 }, { "epoch": 65.70743405275779, "grad_norm": 0.19254349172115326, "learning_rate": 0.001, "loss": 1.7845, "step": 1123400 }, { "epoch": 65.7132830321109, "grad_norm": 0.17758306860923767, "learning_rate": 0.001, "loss": 1.7845, "step": 1123500 }, { "epoch": 65.719132011464, "grad_norm": 0.17593713104724884, "learning_rate": 0.001, "loss": 1.7826, "step": 1123600 }, { "epoch": 65.7249809908171, "grad_norm": 0.14832235872745514, "learning_rate": 0.001, "loss": 1.7825, "step": 1123700 }, { "epoch": 65.7308299701702, "grad_norm": 0.26932060718536377, "learning_rate": 0.001, "loss": 1.7768, "step": 1123800 }, { "epoch": 65.73667894952331, "grad_norm": 0.17091824114322662, "learning_rate": 0.001, "loss": 1.7822, "step": 1123900 }, { "epoch": 65.74252792887641, "grad_norm": 0.16432271897792816, "learning_rate": 0.001, "loss": 1.7817, "step": 1124000 }, { "epoch": 65.74837690822952, "grad_norm": 0.14269781112670898, "learning_rate": 0.001, "loss": 1.7881, "step": 1124100 }, { "epoch": 65.75422588758262, "grad_norm": 0.21333135664463043, "learning_rate": 0.001, "loss": 1.7843, "step": 1124200 }, { "epoch": 65.76007486693572, "grad_norm": 0.2674999535083771, "learning_rate": 0.001, "loss": 1.7859, "step": 1124300 }, { "epoch": 65.76592384628883, "grad_norm": 0.1554381251335144, "learning_rate": 0.001, "loss": 1.7848, "step": 1124400 }, { "epoch": 65.77177282564193, "grad_norm": 0.1841975897550583, "learning_rate": 0.001, "loss": 1.7861, "step": 1124500 }, { "epoch": 65.77762180499502, "grad_norm": 0.20012477040290833, "learning_rate": 0.001, "loss": 1.7861, "step": 1124600 }, { "epoch": 65.78347078434813, "grad_norm": 0.13300107419490814, "learning_rate": 0.001, "loss": 1.7825, "step": 1124700 }, { "epoch": 65.78931976370123, "grad_norm": 0.1832556426525116, "learning_rate": 0.001, "loss": 1.7763, "step": 1124800 }, { "epoch": 65.79516874305433, "grad_norm": 0.1585221141576767, "learning_rate": 0.001, "loss": 1.7848, "step": 1124900 }, { "epoch": 65.80101772240744, "grad_norm": 0.22409258782863617, "learning_rate": 0.001, "loss": 1.783, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_ag_news_accuracy": 0.2415625, "eval_ag_news_bleu_score": 7.081561717062977, "eval_ag_news_bleu_score_sem": 0.4880678980729386, "eval_ag_news_emb_cos_sim": 0.7263100743293762, "eval_ag_news_emb_cos_sim_sem": 0.014579386450350285, "eval_ag_news_emb_top1_equal": 0.953125, "eval_ag_news_emb_top1_equal_sem": 0.01875615119934082, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.705955982208252, "eval_ag_news_n_ngrams_match_1": 14.7109375, "eval_ag_news_n_ngrams_match_2": 4.375, "eval_ag_news_n_ngrams_match_3": 1.578125, "eval_ag_news_num_pred_words": 47.8046875, "eval_ag_news_num_true_words": 45.5546875, "eval_ag_news_perplexity": 14.968619586786648, "eval_ag_news_pred_num_tokens": 71.28125, "eval_ag_news_rouge_score": 0.3098693333363918, "eval_ag_news_runtime": 48.7045, "eval_ag_news_samples_per_second": 10.266, "eval_ag_news_steps_per_second": 0.021, "eval_ag_news_token_set_f1": 0.3428929680115611, "eval_ag_news_token_set_f1_sem": 0.010374579407212994, "eval_ag_news_token_set_precision": 0.3287941250231961, "eval_ag_news_token_set_recall": 0.3652148874779184, "eval_ag_news_true_num_tokens": 63.3828125, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_anthropic_toxic_prompts_accuracy": 0.104421875, "eval_anthropic_toxic_prompts_bleu_score": 40.80733859489749, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.4871301161009876, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.881140410900116, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.010364538989961147, "eval_anthropic_toxic_prompts_emb_top1_equal": 0.9921875, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0078125, "eval_anthropic_toxic_prompts_exact_match": 0.1171875, "eval_anthropic_toxic_prompts_exact_match_sem": 0.02854125206846796, "eval_anthropic_toxic_prompts_loss": 1.2145168781280518, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 10.515625, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 6.4296875, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 4.2890625, "eval_anthropic_toxic_prompts_num_pred_words": 17.3125, "eval_anthropic_toxic_prompts_num_true_words": 17.1640625, "eval_anthropic_toxic_prompts_perplexity": 3.368666194805583, "eval_anthropic_toxic_prompts_pred_num_tokens": 22.8671875, "eval_anthropic_toxic_prompts_rouge_score": 0.6688412908625486, "eval_anthropic_toxic_prompts_runtime": 38.6708, "eval_anthropic_toxic_prompts_samples_per_second": 12.93, "eval_anthropic_toxic_prompts_steps_per_second": 0.026, "eval_anthropic_toxic_prompts_token_set_f1": 0.6939288339987634, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.017858493213003885, "eval_anthropic_toxic_prompts_token_set_precision": 0.6912190831779912, "eval_anthropic_toxic_prompts_token_set_recall": 0.7037132105297378, "eval_anthropic_toxic_prompts_true_num_tokens": 21.0390625, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_arxiv_accuracy": 0.377734375, "eval_arxiv_bleu_score": 2.0493469838126552, "eval_arxiv_bleu_score_sem": 0.18366502498746812, "eval_arxiv_emb_cos_sim": 0.47086301445961, "eval_arxiv_emb_cos_sim_sem": 0.018076090142130852, "eval_arxiv_emb_top1_equal": 0.921875, "eval_arxiv_emb_top1_equal_sem": 0.023813825100660324, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.4089603424072266, "eval_arxiv_n_ngrams_match_1": 14.3359375, "eval_arxiv_n_ngrams_match_2": 2.7421875, "eval_arxiv_n_ngrams_match_3": 0.7578125, "eval_arxiv_num_pred_words": 55.6328125, "eval_arxiv_num_true_words": 86.265625, "eval_arxiv_perplexity": 30.23379511940999, "eval_arxiv_pred_num_tokens": 126.046875, "eval_arxiv_rouge_score": 0.18590883940384598, "eval_arxiv_runtime": 41.5058, "eval_arxiv_samples_per_second": 12.046, "eval_arxiv_steps_per_second": 0.024, "eval_arxiv_token_set_f1": 0.19076024266372213, "eval_arxiv_token_set_f1_sem": 0.008706323566942642, "eval_arxiv_token_set_precision": 0.13123971631538078, "eval_arxiv_token_set_recall": 0.4273396003083628, "eval_arxiv_true_num_tokens": 124.2734375, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_python_code_alpaca_accuracy": 0.136078125, "eval_python_code_alpaca_bleu_score": 28.74615833179668, "eval_python_code_alpaca_bleu_score_sem": 1.7484099027770423, "eval_python_code_alpaca_emb_cos_sim": 0.8865448236465454, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008116018027067184, "eval_python_code_alpaca_emb_top1_equal": 0.9921875, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0078125, "eval_python_code_alpaca_exact_match": 0.015625, "eval_python_code_alpaca_exact_match_sem": 0.011004959288293975, "eval_python_code_alpaca_loss": 1.5035314559936523, "eval_python_code_alpaca_n_ngrams_match_1": 10.3359375, "eval_python_code_alpaca_n_ngrams_match_2": 5.6875, "eval_python_code_alpaca_n_ngrams_match_3": 3.203125, "eval_python_code_alpaca_num_pred_words": 17.4375, "eval_python_code_alpaca_num_true_words": 18.0234375, "eval_python_code_alpaca_perplexity": 4.497543936971691, "eval_python_code_alpaca_pred_num_tokens": 23.015625, "eval_python_code_alpaca_rouge_score": 0.6144362336684277, "eval_python_code_alpaca_runtime": 36.5019, "eval_python_code_alpaca_samples_per_second": 13.698, "eval_python_code_alpaca_steps_per_second": 0.027, "eval_python_code_alpaca_token_set_f1": 0.6220566337289071, "eval_python_code_alpaca_token_set_f1_sem": 0.014383462149910115, "eval_python_code_alpaca_token_set_precision": 0.6102724861724117, "eval_python_code_alpaca_token_set_recall": 0.6393527512083867, "eval_python_code_alpaca_true_num_tokens": 23.4140625, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_wikibio_accuracy": 0.375453125, "eval_wikibio_bleu_score": 7.995815762343016, "eval_wikibio_bleu_score_sem": 0.7398482245841974, "eval_wikibio_emb_cos_sim": 0.6104837656021118, "eval_wikibio_emb_cos_sim_sem": 0.02195647731423378, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.657536506652832, "eval_wikibio_n_ngrams_match_1": 16.5, "eval_wikibio_n_ngrams_match_2": 5.4140625, "eval_wikibio_n_ngrams_match_3": 2.25, "eval_wikibio_num_pred_words": 54.7421875, "eval_wikibio_num_true_words": 54.3203125, "eval_wikibio_perplexity": 14.261113630648959, "eval_wikibio_pred_num_tokens": 109.046875, "eval_wikibio_rouge_score": 0.3181041124442328, "eval_wikibio_runtime": 38.6301, "eval_wikibio_samples_per_second": 12.943, "eval_wikibio_steps_per_second": 0.026, "eval_wikibio_token_set_f1": 0.3303363814258305, "eval_wikibio_token_set_f1_sem": 0.011291201473396625, "eval_wikibio_token_set_precision": 0.29825569279768527, "eval_wikibio_token_set_recall": 0.39984559458194446, "eval_wikibio_true_num_tokens": 103.21875, "step": 1125000 }, { "epoch": 65.80101772240744, "eval_msmarco_accuracy": 0.3936875, "eval_msmarco_bleu_score": 17.254682091774225, "eval_msmarco_bleu_score_sem": 1.4750955530911278, "eval_msmarco_emb_cos_sim": 0.7701455354690552, "eval_msmarco_emb_cos_sim_sem": 0.017688658088445663, "eval_msmarco_emb_top1_equal": 0.9140625, "eval_msmarco_emb_top1_equal_sem": 0.024870097637176514, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.7251665592193604, "eval_msmarco_n_ngrams_match_1": 27.2578125, "eval_msmarco_n_ngrams_match_2": 12.0390625, "eval_msmarco_n_ngrams_match_3": 6.75, "eval_msmarco_num_pred_words": 61.40625, "eval_msmarco_num_true_words": 61.2421875, "eval_msmarco_perplexity": 5.613455924670033, "eval_msmarco_pred_num_tokens": 85.7578125, "eval_msmarco_rouge_score": 0.4401091948887449, "eval_msmarco_runtime": 29.3265, "eval_msmarco_samples_per_second": 17.049, "eval_msmarco_steps_per_second": 0.034, "eval_msmarco_token_set_f1": 0.46368338518826263, "eval_msmarco_token_set_f1_sem": 0.014683283493924762, "eval_msmarco_token_set_precision": 0.4311706411667294, "eval_msmarco_token_set_recall": 0.5277701803551745, "eval_msmarco_true_num_tokens": 80.1484375, "step": 1125000 }, { "epoch": 65.80686670176054, "grad_norm": 0.14556138217449188, "learning_rate": 0.001, "loss": 1.7871, "step": 1125100 }, { "epoch": 65.81271568111364, "grad_norm": 0.15218393504619598, "learning_rate": 0.001, "loss": 1.7856, "step": 1125200 }, { "epoch": 65.81856466046675, "grad_norm": 0.17588217556476593, "learning_rate": 0.001, "loss": 1.7855, "step": 1125300 }, { "epoch": 65.82441363981985, "grad_norm": 0.2363882213830948, "learning_rate": 0.001, "loss": 1.7864, "step": 1125400 }, { "epoch": 65.83026261917296, "grad_norm": 0.18664167821407318, "learning_rate": 0.001, "loss": 1.7897, "step": 1125500 }, { "epoch": 65.83611159852606, "grad_norm": 0.14893920719623566, "learning_rate": 0.001, "loss": 1.7876, "step": 1125600 }, { "epoch": 65.84196057787916, "grad_norm": 0.15619556605815887, "learning_rate": 0.001, "loss": 1.7855, "step": 1125700 }, { "epoch": 65.84780955723227, "grad_norm": 0.16948439180850983, "learning_rate": 0.001, "loss": 1.7834, "step": 1125800 }, { "epoch": 65.85365853658537, "grad_norm": 0.18653692305088043, "learning_rate": 0.001, "loss": 1.7852, "step": 1125900 }, { "epoch": 65.85950751593847, "grad_norm": 0.21326427161693573, "learning_rate": 0.001, "loss": 1.7892, "step": 1126000 }, { "epoch": 65.86535649529158, "grad_norm": 0.20384500920772552, "learning_rate": 0.001, "loss": 1.7844, "step": 1126100 }, { "epoch": 65.87120547464467, "grad_norm": 0.1700090914964676, "learning_rate": 0.001, "loss": 1.7873, "step": 1126200 }, { "epoch": 65.87705445399777, "grad_norm": 0.14967317879199982, "learning_rate": 0.001, "loss": 1.7844, "step": 1126300 }, { "epoch": 65.88290343335088, "grad_norm": 0.18342000246047974, "learning_rate": 0.001, "loss": 1.784, "step": 1126400 }, { "epoch": 65.88875241270398, "grad_norm": 0.17612040042877197, "learning_rate": 0.001, "loss": 1.783, "step": 1126500 }, { "epoch": 65.89460139205708, "grad_norm": 0.1529543399810791, "learning_rate": 0.001, "loss": 1.7821, "step": 1126600 }, { "epoch": 65.90045037141019, "grad_norm": 0.1890793740749359, "learning_rate": 0.001, "loss": 1.7868, "step": 1126700 }, { "epoch": 65.90629935076329, "grad_norm": 0.20050659775733948, "learning_rate": 0.001, "loss": 1.7876, "step": 1126800 }, { "epoch": 65.9121483301164, "grad_norm": 0.14052118360996246, "learning_rate": 0.001, "loss": 1.7787, "step": 1126900 }, { "epoch": 65.9179973094695, "grad_norm": 0.17772644758224487, "learning_rate": 0.001, "loss": 1.7812, "step": 1127000 }, { "epoch": 65.9238462888226, "grad_norm": 0.20203275978565216, "learning_rate": 0.001, "loss": 1.785, "step": 1127100 }, { "epoch": 65.9296952681757, "grad_norm": 0.19551411271095276, "learning_rate": 0.001, "loss": 1.7833, "step": 1127200 }, { "epoch": 65.93554424752881, "grad_norm": 0.1408131718635559, "learning_rate": 0.001, "loss": 1.7809, "step": 1127300 }, { "epoch": 65.94139322688191, "grad_norm": 0.17745649814605713, "learning_rate": 0.001, "loss": 1.7895, "step": 1127400 }, { "epoch": 65.94724220623502, "grad_norm": 0.17855654656887054, "learning_rate": 0.001, "loss": 1.7848, "step": 1127500 }, { "epoch": 65.95309118558812, "grad_norm": 0.1517089456319809, "learning_rate": 0.001, "loss": 1.7838, "step": 1127600 }, { "epoch": 65.95894016494121, "grad_norm": 0.1894020289182663, "learning_rate": 0.001, "loss": 1.7873, "step": 1127700 }, { "epoch": 65.96478914429431, "grad_norm": 0.18033985793590546, "learning_rate": 0.001, "loss": 1.7898, "step": 1127800 }, { "epoch": 65.97063812364742, "grad_norm": 0.24244777858257294, "learning_rate": 0.001, "loss": 1.7852, "step": 1127900 }, { "epoch": 65.97648710300052, "grad_norm": 0.14177237451076508, "learning_rate": 0.001, "loss": 1.7884, "step": 1128000 }, { "epoch": 65.98233608235363, "grad_norm": 0.289364218711853, "learning_rate": 0.001, "loss": 1.7876, "step": 1128100 }, { "epoch": 65.98818506170673, "grad_norm": 0.21131621301174164, "learning_rate": 0.001, "loss": 1.7962, "step": 1128200 }, { "epoch": 65.99403404105983, "grad_norm": 0.20193205773830414, "learning_rate": 0.001, "loss": 1.788, "step": 1128300 }, { "epoch": 65.99988302041294, "grad_norm": 0.22417530417442322, "learning_rate": 0.001, "loss": 1.7869, "step": 1128400 }, { "epoch": 66.00573199976604, "grad_norm": 0.2570534944534302, "learning_rate": 0.001, "loss": 1.7737, "step": 1128500 }, { "epoch": 66.01158097911915, "grad_norm": 0.1735260933637619, "learning_rate": 0.001, "loss": 1.7713, "step": 1128600 }, { "epoch": 66.01742995847225, "grad_norm": 0.16948780417442322, "learning_rate": 0.001, "loss": 1.7721, "step": 1128700 }, { "epoch": 66.02327893782535, "grad_norm": 0.19956074655056, "learning_rate": 0.001, "loss": 1.7697, "step": 1128800 }, { "epoch": 66.02912791717846, "grad_norm": 0.16665995121002197, "learning_rate": 0.001, "loss": 1.7746, "step": 1128900 }, { "epoch": 66.03497689653156, "grad_norm": 0.20353227853775024, "learning_rate": 0.001, "loss": 1.7708, "step": 1129000 }, { "epoch": 66.04082587588466, "grad_norm": 0.24509458243846893, "learning_rate": 0.001, "loss": 1.7738, "step": 1129100 }, { "epoch": 66.04667485523777, "grad_norm": 0.2308899164199829, "learning_rate": 0.001, "loss": 1.779, "step": 1129200 }, { "epoch": 66.05252383459086, "grad_norm": 0.16412046551704407, "learning_rate": 0.001, "loss": 1.7718, "step": 1129300 }, { "epoch": 66.05837281394396, "grad_norm": 0.19938336312770844, "learning_rate": 0.001, "loss": 1.7746, "step": 1129400 }, { "epoch": 66.06422179329707, "grad_norm": 0.16072092950344086, "learning_rate": 0.001, "loss": 1.7716, "step": 1129500 }, { "epoch": 66.07007077265017, "grad_norm": 0.16312170028686523, "learning_rate": 0.001, "loss": 1.7776, "step": 1129600 }, { "epoch": 66.07591975200327, "grad_norm": 0.1622936874628067, "learning_rate": 0.001, "loss": 1.7785, "step": 1129700 }, { "epoch": 66.08176873135638, "grad_norm": 0.23331570625305176, "learning_rate": 0.001, "loss": 1.7782, "step": 1129800 }, { "epoch": 66.08761771070948, "grad_norm": 0.22186560928821564, "learning_rate": 0.001, "loss": 1.7754, "step": 1129900 }, { "epoch": 66.09346669006258, "grad_norm": 0.14944346249103546, "learning_rate": 0.001, "loss": 1.7646, "step": 1130000 }, { "epoch": 66.09931566941569, "grad_norm": 0.22413331270217896, "learning_rate": 0.001, "loss": 1.7721, "step": 1130100 }, { "epoch": 66.10516464876879, "grad_norm": 0.19507597386837006, "learning_rate": 0.001, "loss": 1.7773, "step": 1130200 }, { "epoch": 66.1110136281219, "grad_norm": 0.20994634926319122, "learning_rate": 0.001, "loss": 1.7777, "step": 1130300 }, { "epoch": 66.116862607475, "grad_norm": 0.1987459510564804, "learning_rate": 0.001, "loss": 1.776, "step": 1130400 }, { "epoch": 66.1227115868281, "grad_norm": 0.19331656396389008, "learning_rate": 0.001, "loss": 1.7756, "step": 1130500 }, { "epoch": 66.1285605661812, "grad_norm": 0.18302534520626068, "learning_rate": 0.001, "loss": 1.7757, "step": 1130600 }, { "epoch": 66.13440954553431, "grad_norm": 0.2050216794013977, "learning_rate": 0.001, "loss": 1.77, "step": 1130700 }, { "epoch": 66.1402585248874, "grad_norm": 0.19419796764850616, "learning_rate": 0.001, "loss": 1.7788, "step": 1130800 }, { "epoch": 66.1461075042405, "grad_norm": 0.19652189314365387, "learning_rate": 0.001, "loss": 1.774, "step": 1130900 }, { "epoch": 66.15195648359361, "grad_norm": 0.19094832241535187, "learning_rate": 0.001, "loss": 1.7747, "step": 1131000 }, { "epoch": 66.15780546294671, "grad_norm": 0.30929654836654663, "learning_rate": 0.001, "loss": 1.7767, "step": 1131100 }, { "epoch": 66.16365444229982, "grad_norm": 0.2806834280490875, "learning_rate": 0.001, "loss": 1.7797, "step": 1131200 }, { "epoch": 66.16950342165292, "grad_norm": 0.2405361831188202, "learning_rate": 0.001, "loss": 1.7747, "step": 1131300 }, { "epoch": 66.17535240100602, "grad_norm": 0.1889273226261139, "learning_rate": 0.001, "loss": 1.7747, "step": 1131400 }, { "epoch": 66.18120138035913, "grad_norm": 0.18340328335762024, "learning_rate": 0.001, "loss": 1.772, "step": 1131500 }, { "epoch": 66.18705035971223, "grad_norm": 0.1632472723722458, "learning_rate": 0.001, "loss": 1.7799, "step": 1131600 }, { "epoch": 66.19289933906533, "grad_norm": 0.16681040823459625, "learning_rate": 0.001, "loss": 1.775, "step": 1131700 }, { "epoch": 66.19874831841844, "grad_norm": 0.18219369649887085, "learning_rate": 0.001, "loss": 1.7733, "step": 1131800 }, { "epoch": 66.20459729777154, "grad_norm": 0.18897564709186554, "learning_rate": 0.001, "loss": 1.7776, "step": 1131900 }, { "epoch": 66.21044627712465, "grad_norm": 0.18699988722801208, "learning_rate": 0.001, "loss": 1.7728, "step": 1132000 }, { "epoch": 66.21629525647775, "grad_norm": 0.2194381058216095, "learning_rate": 0.001, "loss": 1.781, "step": 1132100 }, { "epoch": 66.22214423583085, "grad_norm": 0.1849559247493744, "learning_rate": 0.001, "loss": 1.7769, "step": 1132200 }, { "epoch": 66.22799321518396, "grad_norm": 0.3229770362377167, "learning_rate": 0.001, "loss": 1.7783, "step": 1132300 }, { "epoch": 66.23384219453705, "grad_norm": 0.31813961267471313, "learning_rate": 0.001, "loss": 1.7779, "step": 1132400 }, { "epoch": 66.23969117389015, "grad_norm": 0.2441709190607071, "learning_rate": 0.001, "loss": 1.7757, "step": 1132500 }, { "epoch": 66.24554015324325, "grad_norm": 0.18021675944328308, "learning_rate": 0.001, "loss": 1.7778, "step": 1132600 }, { "epoch": 66.25138913259636, "grad_norm": 0.22362197935581207, "learning_rate": 0.001, "loss": 1.7799, "step": 1132700 }, { "epoch": 66.25723811194946, "grad_norm": 0.2249027043581009, "learning_rate": 0.001, "loss": 1.7789, "step": 1132800 }, { "epoch": 66.26308709130257, "grad_norm": 0.15229980647563934, "learning_rate": 0.001, "loss": 1.7766, "step": 1132900 }, { "epoch": 66.26893607065567, "grad_norm": 0.1989976018667221, "learning_rate": 0.001, "loss": 1.7768, "step": 1133000 }, { "epoch": 66.27478505000877, "grad_norm": 0.15661296248435974, "learning_rate": 0.001, "loss": 1.7816, "step": 1133100 }, { "epoch": 66.28063402936188, "grad_norm": 0.17370043694972992, "learning_rate": 0.001, "loss": 1.7774, "step": 1133200 }, { "epoch": 66.28648300871498, "grad_norm": 0.1743459850549698, "learning_rate": 0.001, "loss": 1.78, "step": 1133300 }, { "epoch": 66.29233198806809, "grad_norm": 0.17589309811592102, "learning_rate": 0.001, "loss": 1.7745, "step": 1133400 }, { "epoch": 66.29818096742119, "grad_norm": 0.22614921629428864, "learning_rate": 0.001, "loss": 1.7783, "step": 1133500 }, { "epoch": 66.30402994677429, "grad_norm": 0.20581865310668945, "learning_rate": 0.001, "loss": 1.7741, "step": 1133600 }, { "epoch": 66.3098789261274, "grad_norm": 0.17614911496639252, "learning_rate": 0.001, "loss": 1.778, "step": 1133700 }, { "epoch": 66.3157279054805, "grad_norm": 0.17908109724521637, "learning_rate": 0.001, "loss": 1.7822, "step": 1133800 }, { "epoch": 66.32157688483359, "grad_norm": 0.2517277002334595, "learning_rate": 0.001, "loss": 1.7793, "step": 1133900 }, { "epoch": 66.3274258641867, "grad_norm": 0.1524968296289444, "learning_rate": 0.001, "loss": 1.7773, "step": 1134000 }, { "epoch": 66.3332748435398, "grad_norm": 0.2017209380865097, "learning_rate": 0.001, "loss": 1.7752, "step": 1134100 }, { "epoch": 66.3391238228929, "grad_norm": 0.14989601075649261, "learning_rate": 0.001, "loss": 1.7806, "step": 1134200 }, { "epoch": 66.344972802246, "grad_norm": 0.1882144957780838, "learning_rate": 0.001, "loss": 1.7837, "step": 1134300 }, { "epoch": 66.35082178159911, "grad_norm": 0.19829009473323822, "learning_rate": 0.001, "loss": 1.7712, "step": 1134400 }, { "epoch": 66.35667076095221, "grad_norm": 0.200974240899086, "learning_rate": 0.001, "loss": 1.7765, "step": 1134500 }, { "epoch": 66.36251974030532, "grad_norm": 0.18640132248401642, "learning_rate": 0.001, "loss": 1.7803, "step": 1134600 }, { "epoch": 66.36836871965842, "grad_norm": 0.20568294823169708, "learning_rate": 0.001, "loss": 1.7831, "step": 1134700 }, { "epoch": 66.37421769901152, "grad_norm": 0.23271620273590088, "learning_rate": 0.001, "loss": 1.7824, "step": 1134800 }, { "epoch": 66.38006667836463, "grad_norm": 0.19943401217460632, "learning_rate": 0.001, "loss": 1.7768, "step": 1134900 }, { "epoch": 66.38591565771773, "grad_norm": 0.16116562485694885, "learning_rate": 0.001, "loss": 1.7807, "step": 1135000 }, { "epoch": 66.39176463707084, "grad_norm": 0.20953555405139923, "learning_rate": 0.001, "loss": 1.7774, "step": 1135100 }, { "epoch": 66.39761361642394, "grad_norm": 0.17106777429580688, "learning_rate": 0.001, "loss": 1.7767, "step": 1135200 }, { "epoch": 66.40346259577704, "grad_norm": 0.19183878600597382, "learning_rate": 0.001, "loss": 1.781, "step": 1135300 }, { "epoch": 66.40931157513015, "grad_norm": 0.24121004343032837, "learning_rate": 0.001, "loss": 1.7818, "step": 1135400 }, { "epoch": 66.41516055448324, "grad_norm": 0.18922242522239685, "learning_rate": 0.001, "loss": 1.7831, "step": 1135500 }, { "epoch": 66.42100953383634, "grad_norm": 0.15572844445705414, "learning_rate": 0.001, "loss": 1.7756, "step": 1135600 }, { "epoch": 66.42685851318944, "grad_norm": 0.20471124351024628, "learning_rate": 0.001, "loss": 1.7815, "step": 1135700 }, { "epoch": 66.43270749254255, "grad_norm": 0.1939702332019806, "learning_rate": 0.001, "loss": 1.7863, "step": 1135800 }, { "epoch": 66.43855647189565, "grad_norm": 0.2231166958808899, "learning_rate": 0.001, "loss": 1.7804, "step": 1135900 }, { "epoch": 66.44440545124876, "grad_norm": 0.17732849717140198, "learning_rate": 0.001, "loss": 1.7771, "step": 1136000 }, { "epoch": 66.45025443060186, "grad_norm": 0.18434780836105347, "learning_rate": 0.001, "loss": 1.7816, "step": 1136100 }, { "epoch": 66.45610340995496, "grad_norm": 0.22478756308555603, "learning_rate": 0.001, "loss": 1.7832, "step": 1136200 }, { "epoch": 66.46195238930807, "grad_norm": 0.21312543749809265, "learning_rate": 0.001, "loss": 1.7827, "step": 1136300 }, { "epoch": 66.46780136866117, "grad_norm": 0.17134952545166016, "learning_rate": 0.001, "loss": 1.7742, "step": 1136400 }, { "epoch": 66.47365034801427, "grad_norm": 0.16516301035881042, "learning_rate": 0.001, "loss": 1.7806, "step": 1136500 }, { "epoch": 66.47949932736738, "grad_norm": 0.19354364275932312, "learning_rate": 0.001, "loss": 1.7805, "step": 1136600 }, { "epoch": 66.48534830672048, "grad_norm": 0.22415798902511597, "learning_rate": 0.001, "loss": 1.7817, "step": 1136700 }, { "epoch": 66.49119728607359, "grad_norm": 0.20779018104076385, "learning_rate": 0.001, "loss": 1.7823, "step": 1136800 }, { "epoch": 66.49704626542669, "grad_norm": 0.21742059290409088, "learning_rate": 0.001, "loss": 1.7808, "step": 1136900 }, { "epoch": 66.50289524477978, "grad_norm": 0.17965435981750488, "learning_rate": 0.001, "loss": 1.7837, "step": 1137000 }, { "epoch": 66.50874422413288, "grad_norm": 0.28522494435310364, "learning_rate": 0.001, "loss": 1.7841, "step": 1137100 }, { "epoch": 66.51459320348599, "grad_norm": 0.24160045385360718, "learning_rate": 0.001, "loss": 1.7777, "step": 1137200 }, { "epoch": 66.52044218283909, "grad_norm": 0.18112508952617645, "learning_rate": 0.001, "loss": 1.7827, "step": 1137300 }, { "epoch": 66.5262911621922, "grad_norm": 0.18476396799087524, "learning_rate": 0.001, "loss": 1.7836, "step": 1137400 }, { "epoch": 66.5321401415453, "grad_norm": 0.18923360109329224, "learning_rate": 0.001, "loss": 1.783, "step": 1137500 }, { "epoch": 66.5379891208984, "grad_norm": 0.2940210998058319, "learning_rate": 0.001, "loss": 1.778, "step": 1137600 }, { "epoch": 66.5438381002515, "grad_norm": 0.24003595113754272, "learning_rate": 0.001, "loss": 1.781, "step": 1137700 }, { "epoch": 66.54968707960461, "grad_norm": 0.23275388777256012, "learning_rate": 0.001, "loss": 1.7799, "step": 1137800 }, { "epoch": 66.55553605895771, "grad_norm": 0.2029484361410141, "learning_rate": 0.001, "loss": 1.7836, "step": 1137900 }, { "epoch": 66.56138503831082, "grad_norm": 0.16964943706989288, "learning_rate": 0.001, "loss": 1.7804, "step": 1138000 }, { "epoch": 66.56723401766392, "grad_norm": 0.18445731699466705, "learning_rate": 0.001, "loss": 1.7837, "step": 1138100 }, { "epoch": 66.57308299701702, "grad_norm": 0.2173728197813034, "learning_rate": 0.001, "loss": 1.7798, "step": 1138200 }, { "epoch": 66.57893197637013, "grad_norm": 0.21146650612354279, "learning_rate": 0.001, "loss": 1.7798, "step": 1138300 }, { "epoch": 66.58478095572323, "grad_norm": 0.1853681355714798, "learning_rate": 0.001, "loss": 1.7791, "step": 1138400 }, { "epoch": 66.59062993507634, "grad_norm": 0.31088849902153015, "learning_rate": 0.001, "loss": 1.7808, "step": 1138500 }, { "epoch": 66.59647891442943, "grad_norm": 0.22491779923439026, "learning_rate": 0.001, "loss": 1.7843, "step": 1138600 }, { "epoch": 66.60232789378253, "grad_norm": 0.19432123005390167, "learning_rate": 0.001, "loss": 1.7861, "step": 1138700 }, { "epoch": 66.60817687313563, "grad_norm": 0.1822137087583542, "learning_rate": 0.001, "loss": 1.7802, "step": 1138800 }, { "epoch": 66.61402585248874, "grad_norm": 0.194785937666893, "learning_rate": 0.001, "loss": 1.7843, "step": 1138900 }, { "epoch": 66.61987483184184, "grad_norm": 0.17546550929546356, "learning_rate": 0.001, "loss": 1.7855, "step": 1139000 }, { "epoch": 66.62572381119494, "grad_norm": 0.18681852519512177, "learning_rate": 0.001, "loss": 1.7852, "step": 1139100 }, { "epoch": 66.63157279054805, "grad_norm": 0.24835854768753052, "learning_rate": 0.001, "loss": 1.7805, "step": 1139200 }, { "epoch": 66.63742176990115, "grad_norm": 0.18811969459056854, "learning_rate": 0.001, "loss": 1.7808, "step": 1139300 }, { "epoch": 66.64327074925426, "grad_norm": 0.17803747951984406, "learning_rate": 0.001, "loss": 1.7784, "step": 1139400 }, { "epoch": 66.64911972860736, "grad_norm": 0.2127286195755005, "learning_rate": 0.001, "loss": 1.7815, "step": 1139500 }, { "epoch": 66.65496870796046, "grad_norm": 0.16435064375400543, "learning_rate": 0.001, "loss": 1.7866, "step": 1139600 }, { "epoch": 66.66081768731357, "grad_norm": 0.19875185191631317, "learning_rate": 0.001, "loss": 1.7829, "step": 1139700 }, { "epoch": 66.66666666666667, "grad_norm": 0.17956796288490295, "learning_rate": 0.001, "loss": 1.7806, "step": 1139800 }, { "epoch": 66.67251564601978, "grad_norm": 0.2306315302848816, "learning_rate": 0.001, "loss": 1.7776, "step": 1139900 }, { "epoch": 66.67836462537288, "grad_norm": 0.21892188489437103, "learning_rate": 0.001, "loss": 1.7841, "step": 1140000 }, { "epoch": 66.68421360472597, "grad_norm": 0.22645267844200134, "learning_rate": 0.001, "loss": 1.7755, "step": 1140100 }, { "epoch": 66.69006258407907, "grad_norm": 0.19199547171592712, "learning_rate": 0.001, "loss": 1.7829, "step": 1140200 }, { "epoch": 66.69591156343218, "grad_norm": 0.22354483604431152, "learning_rate": 0.001, "loss": 1.7895, "step": 1140300 }, { "epoch": 66.70176054278528, "grad_norm": 0.21251295506954193, "learning_rate": 0.001, "loss": 1.7853, "step": 1140400 }, { "epoch": 66.70760952213838, "grad_norm": 0.20605714619159698, "learning_rate": 0.001, "loss": 1.7855, "step": 1140500 }, { "epoch": 66.71345850149149, "grad_norm": 0.18113544583320618, "learning_rate": 0.001, "loss": 1.7815, "step": 1140600 }, { "epoch": 66.71930748084459, "grad_norm": 0.17549066245555878, "learning_rate": 0.001, "loss": 1.78, "step": 1140700 }, { "epoch": 66.7251564601977, "grad_norm": 0.22815097868442535, "learning_rate": 0.001, "loss": 1.7836, "step": 1140800 }, { "epoch": 66.7310054395508, "grad_norm": 0.20268891751766205, "learning_rate": 0.001, "loss": 1.7883, "step": 1140900 }, { "epoch": 66.7368544189039, "grad_norm": 0.1736437827348709, "learning_rate": 0.001, "loss": 1.7892, "step": 1141000 }, { "epoch": 66.742703398257, "grad_norm": 0.16865889728069305, "learning_rate": 0.001, "loss": 1.783, "step": 1141100 }, { "epoch": 66.74855237761011, "grad_norm": 0.18001538515090942, "learning_rate": 0.001, "loss": 1.7787, "step": 1141200 }, { "epoch": 66.75440135696321, "grad_norm": 0.204584002494812, "learning_rate": 0.001, "loss": 1.7823, "step": 1141300 }, { "epoch": 66.76025033631632, "grad_norm": 0.19801479578018188, "learning_rate": 0.001, "loss": 1.7826, "step": 1141400 }, { "epoch": 66.76609931566942, "grad_norm": 0.19302423298358917, "learning_rate": 0.001, "loss": 1.7814, "step": 1141500 }, { "epoch": 66.77194829502253, "grad_norm": 0.1687779724597931, "learning_rate": 0.001, "loss": 1.7841, "step": 1141600 }, { "epoch": 66.77779727437562, "grad_norm": 0.15594597160816193, "learning_rate": 0.001, "loss": 1.7793, "step": 1141700 }, { "epoch": 66.78364625372872, "grad_norm": 0.21348318457603455, "learning_rate": 0.001, "loss": 1.7835, "step": 1141800 }, { "epoch": 66.78949523308182, "grad_norm": 0.17853324115276337, "learning_rate": 0.001, "loss": 1.7809, "step": 1141900 }, { "epoch": 66.79534421243493, "grad_norm": 0.28145632147789, "learning_rate": 0.001, "loss": 1.783, "step": 1142000 }, { "epoch": 66.80119319178803, "grad_norm": 0.3025371730327606, "learning_rate": 0.001, "loss": 1.7787, "step": 1142100 }, { "epoch": 66.80704217114113, "grad_norm": 0.22160524129867554, "learning_rate": 0.001, "loss": 1.7852, "step": 1142200 }, { "epoch": 66.81289115049424, "grad_norm": 0.18084746599197388, "learning_rate": 0.001, "loss": 1.7772, "step": 1142300 }, { "epoch": 66.81874012984734, "grad_norm": 0.19884063303470612, "learning_rate": 0.001, "loss": 1.7826, "step": 1142400 }, { "epoch": 66.82458910920045, "grad_norm": 0.19958804547786713, "learning_rate": 0.001, "loss": 1.7838, "step": 1142500 }, { "epoch": 66.83043808855355, "grad_norm": 0.19520491361618042, "learning_rate": 0.001, "loss": 1.781, "step": 1142600 }, { "epoch": 66.83628706790665, "grad_norm": 0.16394992172718048, "learning_rate": 0.001, "loss": 1.7855, "step": 1142700 }, { "epoch": 66.84213604725976, "grad_norm": 0.18227307498455048, "learning_rate": 0.001, "loss": 1.784, "step": 1142800 }, { "epoch": 66.84798502661286, "grad_norm": 0.19480456411838531, "learning_rate": 0.001, "loss": 1.7853, "step": 1142900 }, { "epoch": 66.85383400596596, "grad_norm": 0.2661939859390259, "learning_rate": 0.001, "loss": 1.7841, "step": 1143000 }, { "epoch": 66.85968298531907, "grad_norm": 0.23840542137622833, "learning_rate": 0.001, "loss": 1.7819, "step": 1143100 }, { "epoch": 66.86553196467216, "grad_norm": 0.2288692593574524, "learning_rate": 0.001, "loss": 1.7825, "step": 1143200 }, { "epoch": 66.87138094402526, "grad_norm": 0.1722082942724228, "learning_rate": 0.001, "loss": 1.7819, "step": 1143300 }, { "epoch": 66.87722992337837, "grad_norm": 0.2324887067079544, "learning_rate": 0.001, "loss": 1.7852, "step": 1143400 }, { "epoch": 66.88307890273147, "grad_norm": 0.16456572711467743, "learning_rate": 0.001, "loss": 1.7782, "step": 1143500 }, { "epoch": 66.88892788208457, "grad_norm": 0.20292732119560242, "learning_rate": 0.001, "loss": 1.7813, "step": 1143600 }, { "epoch": 66.89477686143768, "grad_norm": 0.160222128033638, "learning_rate": 0.001, "loss": 1.7805, "step": 1143700 }, { "epoch": 66.90062584079078, "grad_norm": 0.18217213451862335, "learning_rate": 0.001, "loss": 1.78, "step": 1143800 }, { "epoch": 66.90647482014388, "grad_norm": 0.17182986438274384, "learning_rate": 0.001, "loss": 1.783, "step": 1143900 }, { "epoch": 66.91232379949699, "grad_norm": 0.18741126358509064, "learning_rate": 0.001, "loss": 1.7813, "step": 1144000 }, { "epoch": 66.91817277885009, "grad_norm": 0.1793399602174759, "learning_rate": 0.001, "loss": 1.7886, "step": 1144100 }, { "epoch": 66.9240217582032, "grad_norm": 0.18167060613632202, "learning_rate": 0.001, "loss": 1.7835, "step": 1144200 }, { "epoch": 66.9298707375563, "grad_norm": 0.17167748510837555, "learning_rate": 0.001, "loss": 1.7901, "step": 1144300 }, { "epoch": 66.9357197169094, "grad_norm": 0.18012607097625732, "learning_rate": 0.001, "loss": 1.7878, "step": 1144400 }, { "epoch": 66.94156869626251, "grad_norm": 0.15851101279258728, "learning_rate": 0.001, "loss": 1.7839, "step": 1144500 }, { "epoch": 66.94741767561561, "grad_norm": 0.20069155097007751, "learning_rate": 0.001, "loss": 1.782, "step": 1144600 }, { "epoch": 66.95326665496871, "grad_norm": 0.2449982464313507, "learning_rate": 0.001, "loss": 1.7914, "step": 1144700 }, { "epoch": 66.9591156343218, "grad_norm": 0.2435014396905899, "learning_rate": 0.001, "loss": 1.785, "step": 1144800 }, { "epoch": 66.96496461367491, "grad_norm": 0.1966622769832611, "learning_rate": 0.001, "loss": 1.7823, "step": 1144900 }, { "epoch": 66.97081359302801, "grad_norm": 0.3160291314125061, "learning_rate": 0.001, "loss": 1.7923, "step": 1145000 }, { "epoch": 66.97666257238112, "grad_norm": 0.24580524861812592, "learning_rate": 0.001, "loss": 1.7871, "step": 1145100 }, { "epoch": 66.98251155173422, "grad_norm": 0.1782066375017166, "learning_rate": 0.001, "loss": 1.7879, "step": 1145200 }, { "epoch": 66.98836053108732, "grad_norm": 0.18514366447925568, "learning_rate": 0.001, "loss": 1.7847, "step": 1145300 }, { "epoch": 66.99420951044043, "grad_norm": 0.19839830696582794, "learning_rate": 0.001, "loss": 1.7852, "step": 1145400 }, { "epoch": 67.00005848979353, "grad_norm": 0.17907556891441345, "learning_rate": 0.001, "loss": 1.7849, "step": 1145500 }, { "epoch": 67.00590746914663, "grad_norm": 0.1853916347026825, "learning_rate": 0.001, "loss": 1.773, "step": 1145600 }, { "epoch": 67.01175644849974, "grad_norm": 0.15416830778121948, "learning_rate": 0.001, "loss": 1.7664, "step": 1145700 }, { "epoch": 67.01760542785284, "grad_norm": 0.1850869506597519, "learning_rate": 0.001, "loss": 1.7704, "step": 1145800 }, { "epoch": 67.02345440720595, "grad_norm": 0.16691280901432037, "learning_rate": 0.001, "loss": 1.7684, "step": 1145900 }, { "epoch": 67.02930338655905, "grad_norm": 0.19612917304039001, "learning_rate": 0.001, "loss": 1.7758, "step": 1146000 }, { "epoch": 67.03515236591215, "grad_norm": 0.2823502719402313, "learning_rate": 0.001, "loss": 1.7701, "step": 1146100 }, { "epoch": 67.04100134526526, "grad_norm": 0.189975768327713, "learning_rate": 0.001, "loss": 1.7784, "step": 1146200 }, { "epoch": 67.04685032461835, "grad_norm": 0.2091708779335022, "learning_rate": 0.001, "loss": 1.7725, "step": 1146300 }, { "epoch": 67.05269930397145, "grad_norm": 0.15921634435653687, "learning_rate": 0.001, "loss": 1.7698, "step": 1146400 }, { "epoch": 67.05854828332455, "grad_norm": 0.18673178553581238, "learning_rate": 0.001, "loss": 1.7726, "step": 1146500 }, { "epoch": 67.06439726267766, "grad_norm": 0.20698395371437073, "learning_rate": 0.001, "loss": 1.7718, "step": 1146600 }, { "epoch": 67.07024624203076, "grad_norm": 0.13578134775161743, "learning_rate": 0.001, "loss": 1.7679, "step": 1146700 }, { "epoch": 67.07609522138387, "grad_norm": 0.1354006975889206, "learning_rate": 0.001, "loss": 1.7667, "step": 1146800 }, { "epoch": 67.08194420073697, "grad_norm": 0.17357712984085083, "learning_rate": 0.001, "loss": 1.7726, "step": 1146900 }, { "epoch": 67.08779318009007, "grad_norm": 0.21727392077445984, "learning_rate": 0.001, "loss": 1.7708, "step": 1147000 }, { "epoch": 67.09364215944318, "grad_norm": 0.14470742642879486, "learning_rate": 0.001, "loss": 1.7716, "step": 1147100 }, { "epoch": 67.09949113879628, "grad_norm": 0.1643034964799881, "learning_rate": 0.001, "loss": 1.7784, "step": 1147200 }, { "epoch": 67.10534011814939, "grad_norm": 0.2305614948272705, "learning_rate": 0.001, "loss": 1.7733, "step": 1147300 }, { "epoch": 67.11118909750249, "grad_norm": 0.16240453720092773, "learning_rate": 0.001, "loss": 1.7778, "step": 1147400 }, { "epoch": 67.1170380768556, "grad_norm": 0.12645716965198517, "learning_rate": 0.001, "loss": 1.7746, "step": 1147500 }, { "epoch": 67.1228870562087, "grad_norm": 0.1387002170085907, "learning_rate": 0.001, "loss": 1.7692, "step": 1147600 }, { "epoch": 67.1287360355618, "grad_norm": 0.16037966310977936, "learning_rate": 0.001, "loss": 1.7685, "step": 1147700 }, { "epoch": 67.1345850149149, "grad_norm": 0.24472033977508545, "learning_rate": 0.001, "loss": 1.7733, "step": 1147800 }, { "epoch": 67.140433994268, "grad_norm": 0.23969252407550812, "learning_rate": 0.001, "loss": 1.7698, "step": 1147900 }, { "epoch": 67.1462829736211, "grad_norm": 0.1910434365272522, "learning_rate": 0.001, "loss": 1.78, "step": 1148000 }, { "epoch": 67.1521319529742, "grad_norm": 0.15342016518115997, "learning_rate": 0.001, "loss": 1.7766, "step": 1148100 }, { "epoch": 67.1579809323273, "grad_norm": 0.19014813005924225, "learning_rate": 0.001, "loss": 1.7752, "step": 1148200 }, { "epoch": 67.16382991168041, "grad_norm": 0.29989805817604065, "learning_rate": 0.001, "loss": 1.7731, "step": 1148300 }, { "epoch": 67.16967889103351, "grad_norm": 0.20593948662281036, "learning_rate": 0.001, "loss": 1.7746, "step": 1148400 }, { "epoch": 67.17552787038662, "grad_norm": 0.14699596166610718, "learning_rate": 0.001, "loss": 1.7691, "step": 1148500 }, { "epoch": 67.18137684973972, "grad_norm": 0.1311284303665161, "learning_rate": 0.001, "loss": 1.7743, "step": 1148600 }, { "epoch": 67.18722582909282, "grad_norm": 0.18283922970294952, "learning_rate": 0.001, "loss": 1.7693, "step": 1148700 }, { "epoch": 67.19307480844593, "grad_norm": 0.21317873895168304, "learning_rate": 0.001, "loss": 1.7767, "step": 1148800 }, { "epoch": 67.19892378779903, "grad_norm": 0.2722693085670471, "learning_rate": 0.001, "loss": 1.7746, "step": 1148900 }, { "epoch": 67.20477276715214, "grad_norm": 0.2050480842590332, "learning_rate": 0.001, "loss": 1.775, "step": 1149000 }, { "epoch": 67.21062174650524, "grad_norm": 0.17266243696212769, "learning_rate": 0.001, "loss": 1.7773, "step": 1149100 }, { "epoch": 67.21647072585834, "grad_norm": 0.24601557850837708, "learning_rate": 0.001, "loss": 1.7758, "step": 1149200 }, { "epoch": 67.22231970521145, "grad_norm": 0.17080406844615936, "learning_rate": 0.001, "loss": 1.7748, "step": 1149300 }, { "epoch": 67.22816868456454, "grad_norm": 0.13947908580303192, "learning_rate": 0.001, "loss": 1.7663, "step": 1149400 }, { "epoch": 67.23401766391764, "grad_norm": 0.16007061302661896, "learning_rate": 0.001, "loss": 1.7754, "step": 1149500 }, { "epoch": 67.23986664327074, "grad_norm": 0.17255286872386932, "learning_rate": 0.001, "loss": 1.7738, "step": 1149600 }, { "epoch": 67.24571562262385, "grad_norm": 0.16694432497024536, "learning_rate": 0.001, "loss": 1.7782, "step": 1149700 }, { "epoch": 67.25156460197695, "grad_norm": 0.21070092916488647, "learning_rate": 0.001, "loss": 1.7767, "step": 1149800 }, { "epoch": 67.25741358133006, "grad_norm": 0.1672658622264862, "learning_rate": 0.001, "loss": 1.7698, "step": 1149900 }, { "epoch": 67.26326256068316, "grad_norm": 0.2666601538658142, "learning_rate": 0.001, "loss": 1.7739, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_ag_news_accuracy": 0.24396875, "eval_ag_news_bleu_score": 8.244117715669942, "eval_ag_news_bleu_score_sem": 0.6031573752567424, "eval_ag_news_emb_cos_sim": 0.7362315654754639, "eval_ag_news_emb_cos_sim_sem": 0.012788387015461922, "eval_ag_news_emb_top1_equal": 0.9453125, "eval_ag_news_emb_top1_equal_sem": 0.020175758749246597, "eval_ag_news_exact_match": 0.0, "eval_ag_news_exact_match_sem": 0.0, "eval_ag_news_loss": 2.6798417568206787, "eval_ag_news_n_ngrams_match_1": 14.8671875, "eval_ag_news_n_ngrams_match_2": 4.4375, "eval_ag_news_n_ngrams_match_3": 1.8671875, "eval_ag_news_num_pred_words": 46.171875, "eval_ag_news_num_true_words": 44.7109375, "eval_ag_news_perplexity": 14.582785486949279, "eval_ag_news_pred_num_tokens": 68.2734375, "eval_ag_news_rouge_score": 0.31395957509797706, "eval_ag_news_runtime": 41.0559, "eval_ag_news_samples_per_second": 12.179, "eval_ag_news_steps_per_second": 0.024, "eval_ag_news_token_set_f1": 0.34511284751580573, "eval_ag_news_token_set_f1_sem": 0.010472927758922052, "eval_ag_news_token_set_precision": 0.33200051897025845, "eval_ag_news_token_set_recall": 0.3649250884460414, "eval_ag_news_true_num_tokens": 61.5625, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_anthropic_toxic_prompts_accuracy": 0.103375, "eval_anthropic_toxic_prompts_bleu_score": 43.055055946946624, "eval_anthropic_toxic_prompts_bleu_score_sem": 2.755446806415936, "eval_anthropic_toxic_prompts_emb_cos_sim": 0.9004063010215759, "eval_anthropic_toxic_prompts_emb_cos_sim_sem": 0.009951808489859104, "eval_anthropic_toxic_prompts_emb_top1_equal": 1.0, "eval_anthropic_toxic_prompts_emb_top1_equal_sem": 0.0, "eval_anthropic_toxic_prompts_exact_match": 0.15625, "eval_anthropic_toxic_prompts_exact_match_sem": 0.03221922171672519, "eval_anthropic_toxic_prompts_loss": 1.2742414474487305, "eval_anthropic_toxic_prompts_n_ngrams_match_1": 9.3046875, "eval_anthropic_toxic_prompts_n_ngrams_match_2": 5.703125, "eval_anthropic_toxic_prompts_n_ngrams_match_3": 3.65625, "eval_anthropic_toxic_prompts_num_pred_words": 15.7578125, "eval_anthropic_toxic_prompts_num_true_words": 14.984375, "eval_anthropic_toxic_prompts_perplexity": 3.575987806352356, "eval_anthropic_toxic_prompts_pred_num_tokens": 21.1953125, "eval_anthropic_toxic_prompts_rouge_score": 0.6891709540786988, "eval_anthropic_toxic_prompts_runtime": 38.2526, "eval_anthropic_toxic_prompts_samples_per_second": 13.071, "eval_anthropic_toxic_prompts_steps_per_second": 0.026, "eval_anthropic_toxic_prompts_token_set_f1": 0.6990922042923402, "eval_anthropic_toxic_prompts_token_set_f1_sem": 0.018825082112044804, "eval_anthropic_toxic_prompts_token_set_precision": 0.7034993375294369, "eval_anthropic_toxic_prompts_token_set_recall": 0.7024821446027898, "eval_anthropic_toxic_prompts_true_num_tokens": 18.2421875, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_arxiv_accuracy": 0.375921875, "eval_arxiv_bleu_score": 1.6896977843411642, "eval_arxiv_bleu_score_sem": 0.13518439687881004, "eval_arxiv_emb_cos_sim": 0.48277023434638977, "eval_arxiv_emb_cos_sim_sem": 0.018522942438721657, "eval_arxiv_emb_top1_equal": 0.90625, "eval_arxiv_emb_top1_equal_sem": 0.025864720344543457, "eval_arxiv_exact_match": 0.0, "eval_arxiv_exact_match_sem": 0.0, "eval_arxiv_loss": 3.409252166748047, "eval_arxiv_n_ngrams_match_1": 13.578125, "eval_arxiv_n_ngrams_match_2": 2.40625, "eval_arxiv_n_ngrams_match_3": 0.46875, "eval_arxiv_num_pred_words": 57.1015625, "eval_arxiv_num_true_words": 86.4453125, "eval_arxiv_perplexity": 30.2426193642433, "eval_arxiv_pred_num_tokens": 126.171875, "eval_arxiv_rouge_score": 0.17696896715339966, "eval_arxiv_runtime": 32.9757, "eval_arxiv_samples_per_second": 15.163, "eval_arxiv_steps_per_second": 0.03, "eval_arxiv_token_set_f1": 0.18035779543458536, "eval_arxiv_token_set_f1_sem": 0.007770380483505059, "eval_arxiv_token_set_precision": 0.12248669735159466, "eval_arxiv_token_set_recall": 0.41031724508128975, "eval_arxiv_true_num_tokens": 125.5234375, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_python_code_alpaca_accuracy": 0.130609375, "eval_python_code_alpaca_bleu_score": 32.00020096052398, "eval_python_code_alpaca_bleu_score_sem": 1.866737656900792, "eval_python_code_alpaca_emb_cos_sim": 0.885143518447876, "eval_python_code_alpaca_emb_cos_sim_sem": 0.008294115774333477, "eval_python_code_alpaca_emb_top1_equal": 1.0, "eval_python_code_alpaca_emb_top1_equal_sem": 0.0, "eval_python_code_alpaca_exact_match": 0.0234375, "eval_python_code_alpaca_exact_match_sem": 0.013424675715302162, "eval_python_code_alpaca_loss": 1.4569200277328491, "eval_python_code_alpaca_n_ngrams_match_1": 11.5234375, "eval_python_code_alpaca_n_ngrams_match_2": 6.59375, "eval_python_code_alpaca_n_ngrams_match_3": 3.8984375, "eval_python_code_alpaca_num_pred_words": 18.578125, "eval_python_code_alpaca_num_true_words": 19.53125, "eval_python_code_alpaca_perplexity": 4.292717695121897, "eval_python_code_alpaca_pred_num_tokens": 23.796875, "eval_python_code_alpaca_rouge_score": 0.6308042368506582, "eval_python_code_alpaca_runtime": 32.6248, "eval_python_code_alpaca_samples_per_second": 15.326, "eval_python_code_alpaca_steps_per_second": 0.031, "eval_python_code_alpaca_token_set_f1": 0.6473011251916272, "eval_python_code_alpaca_token_set_f1_sem": 0.014079105823830356, "eval_python_code_alpaca_token_set_precision": 0.6315778347512399, "eval_python_code_alpaca_token_set_recall": 0.6670551652880692, "eval_python_code_alpaca_true_num_tokens": 24.8125, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_wikibio_accuracy": 0.37115625, "eval_wikibio_bleu_score": 7.49313469486184, "eval_wikibio_bleu_score_sem": 0.7091835548009827, "eval_wikibio_emb_cos_sim": 0.6268566846847534, "eval_wikibio_emb_cos_sim_sem": 0.022281242534518242, "eval_wikibio_emb_top1_equal": 0.9296875, "eval_wikibio_emb_top1_equal_sem": 0.022687306627631187, "eval_wikibio_exact_match": 0.0, "eval_wikibio_exact_match_sem": 0.0, "eval_wikibio_loss": 2.6958906650543213, "eval_wikibio_n_ngrams_match_1": 15.421875, "eval_wikibio_n_ngrams_match_2": 5.265625, "eval_wikibio_n_ngrams_match_3": 2.2578125, "eval_wikibio_num_pred_words": 52.1484375, "eval_wikibio_num_true_words": 52.8671875, "eval_wikibio_perplexity": 14.818711385688578, "eval_wikibio_pred_num_tokens": 106.796875, "eval_wikibio_rouge_score": 0.2996950241819345, "eval_wikibio_runtime": 34.5719, "eval_wikibio_samples_per_second": 14.463, "eval_wikibio_steps_per_second": 0.029, "eval_wikibio_token_set_f1": 0.3181066639841368, "eval_wikibio_token_set_f1_sem": 0.012890488482391597, "eval_wikibio_token_set_precision": 0.2835278759349723, "eval_wikibio_token_set_recall": 0.4018807741138027, "eval_wikibio_true_num_tokens": 100.4375, "step": 1150000 }, { "epoch": 67.26326256068316, "eval_msmarco_accuracy": 0.405015625, "eval_msmarco_bleu_score": 18.62918593031269, "eval_msmarco_bleu_score_sem": 1.65786416184323, "eval_msmarco_emb_cos_sim": 0.7961995005607605, "eval_msmarco_emb_cos_sim_sem": 0.0158347487449646, "eval_msmarco_emb_top1_equal": 0.9453125, "eval_msmarco_emb_top1_equal_sem": 0.020175758749246597, "eval_msmarco_exact_match": 0.0, "eval_msmarco_exact_match_sem": 0.0, "eval_msmarco_loss": 1.6814647912979126, "eval_msmarco_n_ngrams_match_1": 29.8984375, "eval_msmarco_n_ngrams_match_2": 14.1640625, "eval_msmarco_n_ngrams_match_3": 8.1796875, "eval_msmarco_num_pred_words": 62.359375, "eval_msmarco_num_true_words": 63.265625, "eval_msmarco_perplexity": 5.373421149833301, "eval_msmarco_pred_num_tokens": 85.8671875, "eval_msmarco_rouge_score": 0.4551451387532367, "eval_msmarco_runtime": 31.6991, "eval_msmarco_samples_per_second": 15.773, "eval_msmarco_steps_per_second": 0.032, "eval_msmarco_token_set_f1": 0.4784064823360562, "eval_msmarco_token_set_f1_sem": 0.015146263693995961, "eval_msmarco_token_set_precision": 0.44627800631379555, "eval_msmarco_token_set_recall": 0.5328942924467254, "eval_msmarco_true_num_tokens": 82.0, "step": 1150000 }, { "epoch": 67.26911154003626, "grad_norm": 0.1978638768196106, "learning_rate": 0.001, "loss": 1.7755, "step": 1150100 }, { "epoch": 67.27496051938937, "grad_norm": 0.21636711061000824, "learning_rate": 0.001, "loss": 1.7812, "step": 1150200 }, { "epoch": 67.28080949874247, "grad_norm": 0.22236822545528412, "learning_rate": 0.001, "loss": 1.7738, "step": 1150300 }, { "epoch": 67.28665847809557, "grad_norm": 0.16651611030101776, "learning_rate": 0.001, "loss": 1.7726, "step": 1150400 }, { "epoch": 67.29250745744868, "grad_norm": 0.1819765567779541, "learning_rate": 0.001, "loss": 1.7787, "step": 1150500 }, { "epoch": 67.29835643680178, "grad_norm": 0.17574618756771088, "learning_rate": 0.001, "loss": 1.7731, "step": 1150600 }, { "epoch": 67.30420541615489, "grad_norm": 0.30779334902763367, "learning_rate": 0.001, "loss": 1.7762, "step": 1150700 }, { "epoch": 67.31005439550799, "grad_norm": 0.23903882503509521, "learning_rate": 0.001, "loss": 1.7895, "step": 1150800 }, { "epoch": 67.3159033748611, "grad_norm": 0.16397501528263092, "learning_rate": 0.001, "loss": 1.7708, "step": 1150900 }, { "epoch": 67.32175235421418, "grad_norm": 0.1511811763048172, "learning_rate": 0.001, "loss": 1.7737, "step": 1151000 }, { "epoch": 67.32760133356729, "grad_norm": 0.1477344036102295, "learning_rate": 0.001, "loss": 1.779, "step": 1151100 }, { "epoch": 67.33345031292039, "grad_norm": 0.17269977927207947, "learning_rate": 0.001, "loss": 1.7789, "step": 1151200 }, { "epoch": 67.3392992922735, "grad_norm": 0.24543453752994537, "learning_rate": 0.001, "loss": 1.7758, "step": 1151300 }, { "epoch": 67.3451482716266, "grad_norm": 0.17844435572624207, "learning_rate": 0.001, "loss": 1.7796, "step": 1151400 }, { "epoch": 67.3509972509797, "grad_norm": 0.21282844245433807, "learning_rate": 0.001, "loss": 1.7728, "step": 1151500 }, { "epoch": 67.3568462303328, "grad_norm": 0.14322690665721893, "learning_rate": 0.001, "loss": 1.7794, "step": 1151600 }, { "epoch": 67.36269520968591, "grad_norm": 0.14044266939163208, "learning_rate": 0.001, "loss": 1.7763, "step": 1151700 }, { "epoch": 67.36854418903901, "grad_norm": 0.167304128408432, "learning_rate": 0.001, "loss": 1.78, "step": 1151800 }, { "epoch": 67.37439316839212, "grad_norm": 0.14325405657291412, "learning_rate": 0.001, "loss": 1.7812, "step": 1151900 }, { "epoch": 67.38024214774522, "grad_norm": 0.23149695992469788, "learning_rate": 0.001, "loss": 1.7788, "step": 1152000 }, { "epoch": 67.38609112709833, "grad_norm": 0.17950168251991272, "learning_rate": 0.001, "loss": 1.7789, "step": 1152100 }, { "epoch": 67.39194010645143, "grad_norm": 0.26895788311958313, "learning_rate": 0.001, "loss": 1.778, "step": 1152200 }, { "epoch": 67.39778908580453, "grad_norm": 0.3610552251338959, "learning_rate": 0.001, "loss": 1.7817, "step": 1152300 }, { "epoch": 67.40363806515764, "grad_norm": 0.18123771250247955, "learning_rate": 0.001, "loss": 1.7809, "step": 1152400 }, { "epoch": 67.40948704451073, "grad_norm": 0.14914944767951965, "learning_rate": 0.001, "loss": 1.7802, "step": 1152500 }, { "epoch": 67.41533602386383, "grad_norm": 0.15390875935554504, "learning_rate": 0.001, "loss": 1.7771, "step": 1152600 }, { "epoch": 67.42118500321693, "grad_norm": 0.18077881634235382, "learning_rate": 0.001, "loss": 1.7723, "step": 1152700 }, { "epoch": 67.42703398257004, "grad_norm": 0.16372588276863098, "learning_rate": 0.001, "loss": 1.7824, "step": 1152800 }, { "epoch": 67.43288296192314, "grad_norm": 0.17176608741283417, "learning_rate": 0.001, "loss": 1.7779, "step": 1152900 }, { "epoch": 67.43873194127625, "grad_norm": 0.11946366727352142, "learning_rate": 0.001, "loss": 1.7772, "step": 1153000 }, { "epoch": 67.44458092062935, "grad_norm": 0.2017696052789688, "learning_rate": 0.001, "loss": 1.7729, "step": 1153100 }, { "epoch": 67.45042989998245, "grad_norm": 0.16389019787311554, "learning_rate": 0.001, "loss": 1.7797, "step": 1153200 }, { "epoch": 67.45627887933556, "grad_norm": 0.18015733361244202, "learning_rate": 0.001, "loss": 1.7771, "step": 1153300 }, { "epoch": 67.46212785868866, "grad_norm": 0.19853074848651886, "learning_rate": 0.001, "loss": 1.7802, "step": 1153400 }, { "epoch": 67.46797683804176, "grad_norm": 0.2715133726596832, "learning_rate": 0.001, "loss": 1.7797, "step": 1153500 }, { "epoch": 67.47382581739487, "grad_norm": 0.19996827840805054, "learning_rate": 0.001, "loss": 1.7739, "step": 1153600 }, { "epoch": 67.47967479674797, "grad_norm": 0.16931867599487305, "learning_rate": 0.001, "loss": 1.7752, "step": 1153700 }, { "epoch": 67.48552377610108, "grad_norm": 0.15621981024742126, "learning_rate": 0.001, "loss": 1.7771, "step": 1153800 }, { "epoch": 67.49137275545418, "grad_norm": 0.14693434536457062, "learning_rate": 0.001, "loss": 1.7808, "step": 1153900 }, { "epoch": 67.49722173480728, "grad_norm": 0.15026795864105225, "learning_rate": 0.001, "loss": 1.7793, "step": 1154000 }, { "epoch": 67.50307071416037, "grad_norm": 0.15091748535633087, "learning_rate": 0.001, "loss": 1.7719, "step": 1154100 }, { "epoch": 67.50891969351348, "grad_norm": 0.14558903872966766, "learning_rate": 0.001, "loss": 1.7823, "step": 1154200 }, { "epoch": 67.51476867286658, "grad_norm": 0.20901405811309814, "learning_rate": 0.001, "loss": 1.7786, "step": 1154300 }, { "epoch": 67.52061765221968, "grad_norm": 0.20533423125743866, "learning_rate": 0.001, "loss": 1.785, "step": 1154400 }, { "epoch": 67.52646663157279, "grad_norm": 0.1532546579837799, "learning_rate": 0.001, "loss": 1.775, "step": 1154500 }, { "epoch": 67.53231561092589, "grad_norm": 0.19892729818820953, "learning_rate": 0.001, "loss": 1.7839, "step": 1154600 }, { "epoch": 67.538164590279, "grad_norm": 0.19239366054534912, "learning_rate": 0.001, "loss": 1.778, "step": 1154700 }, { "epoch": 67.5440135696321, "grad_norm": 0.15724000334739685, "learning_rate": 0.001, "loss": 1.7831, "step": 1154800 }, { "epoch": 67.5498625489852, "grad_norm": 0.19453515112400055, "learning_rate": 0.001, "loss": 1.7794, "step": 1154900 }, { "epoch": 67.5557115283383, "grad_norm": 0.23443293571472168, "learning_rate": 0.001, "loss": 1.7807, "step": 1155000 }, { "epoch": 67.56156050769141, "grad_norm": 0.17972606420516968, "learning_rate": 0.001, "loss": 1.7851, "step": 1155100 }, { "epoch": 67.56740948704451, "grad_norm": 0.13310223817825317, "learning_rate": 0.001, "loss": 1.7793, "step": 1155200 }, { "epoch": 67.57325846639762, "grad_norm": 0.25117024779319763, "learning_rate": 0.001, "loss": 1.7772, "step": 1155300 }, { "epoch": 67.57910744575072, "grad_norm": 0.15559564530849457, "learning_rate": 0.001, "loss": 1.7784, "step": 1155400 }, { "epoch": 67.58495642510383, "grad_norm": 0.18404994904994965, "learning_rate": 0.001, "loss": 1.781, "step": 1155500 }, { "epoch": 67.59080540445692, "grad_norm": 0.18765440583229065, "learning_rate": 0.001, "loss": 1.7799, "step": 1155600 }, { "epoch": 67.59665438381002, "grad_norm": 0.1271800547838211, "learning_rate": 0.001, "loss": 1.7855, "step": 1155700 }, { "epoch": 67.60250336316312, "grad_norm": 0.21248021721839905, "learning_rate": 0.001, "loss": 1.7797, "step": 1155800 }, { "epoch": 67.60835234251623, "grad_norm": 0.13941466808319092, "learning_rate": 0.001, "loss": 1.7811, "step": 1155900 }, { "epoch": 67.61420132186933, "grad_norm": 0.21550464630126953, "learning_rate": 0.001, "loss": 1.7829, "step": 1156000 }, { "epoch": 67.62005030122243, "grad_norm": 0.16157029569149017, "learning_rate": 0.001, "loss": 1.7797, "step": 1156100 }, { "epoch": 67.62589928057554, "grad_norm": 0.18792125582695007, "learning_rate": 0.001, "loss": 1.7799, "step": 1156200 }, { "epoch": 67.63174825992864, "grad_norm": 0.14156943559646606, "learning_rate": 0.001, "loss": 1.7859, "step": 1156300 }, { "epoch": 67.63759723928175, "grad_norm": 0.1384029984474182, "learning_rate": 0.001, "loss": 1.7841, "step": 1156400 }, { "epoch": 67.64344621863485, "grad_norm": 0.14908719062805176, "learning_rate": 0.001, "loss": 1.7738, "step": 1156500 }, { "epoch": 67.64929519798795, "grad_norm": 0.1959228217601776, "learning_rate": 0.001, "loss": 1.7806, "step": 1156600 }, { "epoch": 67.65514417734106, "grad_norm": 0.1596013605594635, "learning_rate": 0.001, "loss": 1.7797, "step": 1156700 }, { "epoch": 67.66099315669416, "grad_norm": 0.1349838227033615, "learning_rate": 0.001, "loss": 1.7827, "step": 1156800 }, { "epoch": 67.66684213604726, "grad_norm": 0.2351255714893341, "learning_rate": 0.001, "loss": 1.7804, "step": 1156900 }, { "epoch": 67.67269111540037, "grad_norm": 0.17307935655117035, "learning_rate": 0.001, "loss": 1.7799, "step": 1157000 }, { "epoch": 67.67854009475347, "grad_norm": 0.1603792905807495, "learning_rate": 0.001, "loss": 1.7813, "step": 1157100 }, { "epoch": 67.68438907410656, "grad_norm": 0.19401560723781586, "learning_rate": 0.001, "loss": 1.777, "step": 1157200 }, { "epoch": 67.69023805345967, "grad_norm": 0.18409128487110138, "learning_rate": 0.001, "loss": 1.7823, "step": 1157300 }, { "epoch": 67.69608703281277, "grad_norm": 0.17197392880916595, "learning_rate": 0.001, "loss": 1.7772, "step": 1157400 }, { "epoch": 67.70193601216587, "grad_norm": 0.15920016169548035, "learning_rate": 0.001, "loss": 1.785, "step": 1157500 }, { "epoch": 67.70778499151898, "grad_norm": 0.15405800938606262, "learning_rate": 0.001, "loss": 1.7818, "step": 1157600 }, { "epoch": 67.71363397087208, "grad_norm": 0.30272194743156433, "learning_rate": 0.001, "loss": 1.7779, "step": 1157700 }, { "epoch": 67.71948295022518, "grad_norm": 0.22489388287067413, "learning_rate": 0.001, "loss": 1.7817, "step": 1157800 }, { "epoch": 67.72533192957829, "grad_norm": 0.20573028922080994, "learning_rate": 0.001, "loss": 1.7823, "step": 1157900 }, { "epoch": 67.73118090893139, "grad_norm": 0.22896213829517365, "learning_rate": 0.001, "loss": 1.7831, "step": 1158000 }, { "epoch": 67.7370298882845, "grad_norm": 0.13499881327152252, "learning_rate": 0.001, "loss": 1.7767, "step": 1158100 }, { "epoch": 67.7428788676376, "grad_norm": 0.16002511978149414, "learning_rate": 0.001, "loss": 1.7822, "step": 1158200 }, { "epoch": 67.7487278469907, "grad_norm": 0.15839998424053192, "learning_rate": 0.001, "loss": 1.7746, "step": 1158300 }, { "epoch": 67.75457682634381, "grad_norm": 0.31233468651771545, "learning_rate": 0.001, "loss": 1.7815, "step": 1158400 }, { "epoch": 67.76042580569691, "grad_norm": 0.16241402924060822, "learning_rate": 0.001, "loss": 1.7805, "step": 1158500 }, { "epoch": 67.76627478505002, "grad_norm": 0.18594558537006378, "learning_rate": 0.001, "loss": 1.7853, "step": 1158600 }, { "epoch": 67.7721237644031, "grad_norm": 0.1900181919336319, "learning_rate": 0.001, "loss": 1.7818, "step": 1158700 }, { "epoch": 67.77797274375621, "grad_norm": 0.17585083842277527, "learning_rate": 0.001, "loss": 1.7877, "step": 1158800 }, { "epoch": 67.78382172310931, "grad_norm": 0.20134088397026062, "learning_rate": 0.001, "loss": 1.7858, "step": 1158900 }, { "epoch": 67.78967070246242, "grad_norm": 0.15764927864074707, "learning_rate": 0.001, "loss": 1.7821, "step": 1159000 }, { "epoch": 67.79551968181552, "grad_norm": 0.1727159321308136, "learning_rate": 0.001, "loss": 1.7787, "step": 1159100 }, { "epoch": 67.80136866116862, "grad_norm": 0.14773185551166534, "learning_rate": 0.001, "loss": 1.7859, "step": 1159200 }, { "epoch": 67.80721764052173, "grad_norm": 0.22444553673267365, "learning_rate": 0.001, "loss": 1.7768, "step": 1159300 }, { "epoch": 67.81306661987483, "grad_norm": 0.14633485674858093, "learning_rate": 0.001, "loss": 1.7855, "step": 1159400 }, { "epoch": 67.81891559922794, "grad_norm": 0.19286178052425385, "learning_rate": 0.001, "loss": 1.7824, "step": 1159500 }, { "epoch": 67.82476457858104, "grad_norm": 0.2466661036014557, "learning_rate": 0.001, "loss": 1.7808, "step": 1159600 }, { "epoch": 67.83061355793414, "grad_norm": 0.2050388902425766, "learning_rate": 0.001, "loss": 1.7775, "step": 1159700 }, { "epoch": 67.83646253728725, "grad_norm": 0.15320248901844025, "learning_rate": 0.001, "loss": 1.7807, "step": 1159800 }, { "epoch": 67.84231151664035, "grad_norm": 0.26028022170066833, "learning_rate": 0.001, "loss": 1.7832, "step": 1159900 }, { "epoch": 67.84816049599345, "grad_norm": 0.1513073444366455, "learning_rate": 0.001, "loss": 1.7839, "step": 1160000 }, { "epoch": 67.85400947534656, "grad_norm": 0.20229899883270264, "learning_rate": 0.001, "loss": 1.7805, "step": 1160100 }, { "epoch": 67.85985845469966, "grad_norm": 0.20206694304943085, "learning_rate": 0.001, "loss": 1.7845, "step": 1160200 }, { "epoch": 67.86570743405275, "grad_norm": 0.15397047996520996, "learning_rate": 0.001, "loss": 1.7853, "step": 1160300 }, { "epoch": 67.87155641340586, "grad_norm": 0.1773584634065628, "learning_rate": 0.001, "loss": 1.7858, "step": 1160400 }, { "epoch": 67.87740539275896, "grad_norm": 0.15100525319576263, "learning_rate": 0.001, "loss": 1.7768, "step": 1160500 }, { "epoch": 67.88325437211206, "grad_norm": 0.18742971122264862, "learning_rate": 0.001, "loss": 1.7841, "step": 1160600 }, { "epoch": 67.88910335146517, "grad_norm": 0.23368512094020844, "learning_rate": 0.001, "loss": 1.7768, "step": 1160700 }, { "epoch": 67.89495233081827, "grad_norm": 0.2058468461036682, "learning_rate": 0.001, "loss": 1.7811, "step": 1160800 }, { "epoch": 67.90080131017137, "grad_norm": 0.17846785485744476, "learning_rate": 0.001, "loss": 1.7866, "step": 1160900 }, { "epoch": 67.90665028952448, "grad_norm": 0.17832361161708832, "learning_rate": 0.001, "loss": 1.7789, "step": 1161000 }, { "epoch": 67.91249926887758, "grad_norm": 0.17903918027877808, "learning_rate": 0.001, "loss": 1.7778, "step": 1161100 }, { "epoch": 67.91834824823069, "grad_norm": 0.14733782410621643, "learning_rate": 0.001, "loss": 1.7794, "step": 1161200 }, { "epoch": 67.92419722758379, "grad_norm": 0.2063792645931244, "learning_rate": 0.001, "loss": 1.781, "step": 1161300 }, { "epoch": 67.9300462069369, "grad_norm": 0.20042340457439423, "learning_rate": 0.001, "loss": 1.7818, "step": 1161400 }, { "epoch": 67.93589518629, "grad_norm": 0.19543662667274475, "learning_rate": 0.001, "loss": 1.7822, "step": 1161500 }, { "epoch": 67.9417441656431, "grad_norm": 0.1589260995388031, "learning_rate": 0.001, "loss": 1.783, "step": 1161600 }, { "epoch": 67.9475931449962, "grad_norm": 0.14331263303756714, "learning_rate": 0.001, "loss": 1.7806, "step": 1161700 }, { "epoch": 67.9534421243493, "grad_norm": 0.15746720135211945, "learning_rate": 0.001, "loss": 1.7855, "step": 1161800 }, { "epoch": 67.9592911037024, "grad_norm": 0.1995401829481125, "learning_rate": 0.001, "loss": 1.7742, "step": 1161900 }, { "epoch": 67.9651400830555, "grad_norm": 0.18856152892112732, "learning_rate": 0.001, "loss": 1.7866, "step": 1162000 }, { "epoch": 67.9709890624086, "grad_norm": 0.17737333476543427, "learning_rate": 0.001, "loss": 1.7811, "step": 1162100 }, { "epoch": 67.97683804176171, "grad_norm": 0.15646755695343018, "learning_rate": 0.001, "loss": 1.784, "step": 1162200 }, { "epoch": 67.98268702111481, "grad_norm": 0.259946346282959, "learning_rate": 0.001, "loss": 1.7835, "step": 1162300 }, { "epoch": 67.98853600046792, "grad_norm": 0.13136452436447144, "learning_rate": 0.001, "loss": 1.7771, "step": 1162400 }, { "epoch": 67.99438497982102, "grad_norm": 0.2528892755508423, "learning_rate": 0.001, "loss": 1.7823, "step": 1162500 }, { "epoch": 68.00023395917412, "grad_norm": 0.19519604742527008, "learning_rate": 0.001, "loss": 1.7915, "step": 1162600 }, { "epoch": 68.00608293852723, "grad_norm": 0.14121782779693604, "learning_rate": 0.001, "loss": 1.7671, "step": 1162700 }, { "epoch": 68.01193191788033, "grad_norm": 0.23543404042720795, "learning_rate": 0.001, "loss": 1.7674, "step": 1162800 }, { "epoch": 68.01778089723344, "grad_norm": 0.20095615088939667, "learning_rate": 0.001, "loss": 1.7738, "step": 1162900 }, { "epoch": 68.02362987658654, "grad_norm": 0.27664387226104736, "learning_rate": 0.001, "loss": 1.7692, "step": 1163000 }, { "epoch": 68.02947885593964, "grad_norm": 0.1478748768568039, "learning_rate": 0.001, "loss": 1.77, "step": 1163100 }, { "epoch": 68.03532783529275, "grad_norm": 0.19249805808067322, "learning_rate": 0.001, "loss": 1.7706, "step": 1163200 }, { "epoch": 68.04117681464585, "grad_norm": 0.1601804494857788, "learning_rate": 0.001, "loss": 1.7764, "step": 1163300 }, { "epoch": 68.04702579399894, "grad_norm": 0.20536425709724426, "learning_rate": 0.001, "loss": 1.7698, "step": 1163400 }, { "epoch": 68.05287477335204, "grad_norm": 0.16941308975219727, "learning_rate": 0.001, "loss": 1.7693, "step": 1163500 }, { "epoch": 68.05872375270515, "grad_norm": 0.21628925204277039, "learning_rate": 0.001, "loss": 1.7671, "step": 1163600 }, { "epoch": 68.06457273205825, "grad_norm": 0.19683878123760223, "learning_rate": 0.001, "loss": 1.77, "step": 1163700 }, { "epoch": 68.07042171141136, "grad_norm": 0.21107003092765808, "learning_rate": 0.001, "loss": 1.766, "step": 1163800 }, { "epoch": 68.07627069076446, "grad_norm": 0.14083629846572876, "learning_rate": 0.001, "loss": 1.7728, "step": 1163900 }, { "epoch": 68.08211967011756, "grad_norm": 0.1572033166885376, "learning_rate": 0.001, "loss": 1.7724, "step": 1164000 }, { "epoch": 68.08796864947067, "grad_norm": 0.14910361170768738, "learning_rate": 0.001, "loss": 1.7728, "step": 1164100 }, { "epoch": 68.09381762882377, "grad_norm": 0.18646781146526337, "learning_rate": 0.001, "loss": 1.7678, "step": 1164200 }, { "epoch": 68.09966660817688, "grad_norm": 0.17057068645954132, "learning_rate": 0.001, "loss": 1.7716, "step": 1164300 }, { "epoch": 68.10551558752998, "grad_norm": 0.12301348894834518, "learning_rate": 0.001, "loss": 1.7715, "step": 1164400 }, { "epoch": 68.11136456688308, "grad_norm": 0.1399354487657547, "learning_rate": 0.001, "loss": 1.7779, "step": 1164500 }, { "epoch": 68.11721354623619, "grad_norm": 0.15687496960163116, "learning_rate": 0.001, "loss": 1.7692, "step": 1164600 }, { "epoch": 68.12306252558929, "grad_norm": 0.2192431539297104, "learning_rate": 0.001, "loss": 1.7745, "step": 1164700 }, { "epoch": 68.1289115049424, "grad_norm": 0.15930145978927612, "learning_rate": 0.001, "loss": 1.7751, "step": 1164800 }, { "epoch": 68.13476048429548, "grad_norm": 0.16408687829971313, "learning_rate": 0.001, "loss": 1.7696, "step": 1164900 }, { "epoch": 68.14060946364859, "grad_norm": 0.21872174739837646, "learning_rate": 0.001, "loss": 1.7695, "step": 1165000 }, { "epoch": 68.14645844300169, "grad_norm": 0.22161157429218292, "learning_rate": 0.001, "loss": 1.7719, "step": 1165100 }, { "epoch": 68.1523074223548, "grad_norm": 0.16609780490398407, "learning_rate": 0.001, "loss": 1.7697, "step": 1165200 }, { "epoch": 68.1581564017079, "grad_norm": 0.20217852294445038, "learning_rate": 0.001, "loss": 1.7758, "step": 1165300 }, { "epoch": 68.164005381061, "grad_norm": 0.1905067414045334, "learning_rate": 0.001, "loss": 1.7718, "step": 1165400 }, { "epoch": 68.1698543604141, "grad_norm": 0.18980245292186737, "learning_rate": 0.001, "loss": 1.7829, "step": 1165500 }, { "epoch": 68.17570333976721, "grad_norm": 0.2689572870731354, "learning_rate": 0.001, "loss": 1.7748, "step": 1165600 }, { "epoch": 68.18155231912031, "grad_norm": 0.19226418435573578, "learning_rate": 0.001, "loss": 1.7685, "step": 1165700 }, { "epoch": 68.18740129847342, "grad_norm": 0.19598156213760376, "learning_rate": 0.001, "loss": 1.7706, "step": 1165800 }, { "epoch": 68.19325027782652, "grad_norm": 0.1380053460597992, "learning_rate": 0.001, "loss": 1.772, "step": 1165900 }, { "epoch": 68.19909925717963, "grad_norm": 0.15567567944526672, "learning_rate": 0.001, "loss": 1.7731, "step": 1166000 }, { "epoch": 68.20494823653273, "grad_norm": 0.2371741682291031, "learning_rate": 0.001, "loss": 1.7721, "step": 1166100 }, { "epoch": 68.21079721588583, "grad_norm": 0.17000915110111237, "learning_rate": 0.001, "loss": 1.7752, "step": 1166200 }, { "epoch": 68.21664619523894, "grad_norm": 0.23933203518390656, "learning_rate": 0.001, "loss": 1.7756, "step": 1166300 }, { "epoch": 68.22249517459204, "grad_norm": 0.20606833696365356, "learning_rate": 0.001, "loss": 1.7782, "step": 1166400 }, { "epoch": 68.22834415394513, "grad_norm": 0.19679208099842072, "learning_rate": 0.001, "loss": 1.773, "step": 1166500 }, { "epoch": 68.23419313329823, "grad_norm": 0.14069321751594543, "learning_rate": 0.001, "loss": 1.7708, "step": 1166600 }, { "epoch": 68.24004211265134, "grad_norm": 0.17825476825237274, "learning_rate": 0.001, "loss": 1.7766, "step": 1166700 }, { "epoch": 68.24589109200444, "grad_norm": 0.19400262832641602, "learning_rate": 0.001, "loss": 1.7749, "step": 1166800 }, { "epoch": 68.25174007135755, "grad_norm": 0.21884991228580475, "learning_rate": 0.001, "loss": 1.7781, "step": 1166900 }, { "epoch": 68.25758905071065, "grad_norm": 0.18413285911083221, "learning_rate": 0.001, "loss": 1.7724, "step": 1167000 }, { "epoch": 68.26343803006375, "grad_norm": 0.17825067043304443, "learning_rate": 0.001, "loss": 1.7737, "step": 1167100 }, { "epoch": 68.26928700941686, "grad_norm": 0.15897397696971893, "learning_rate": 0.001, "loss": 1.7725, "step": 1167200 }, { "epoch": 68.27513598876996, "grad_norm": 0.1978849172592163, "learning_rate": 0.001, "loss": 1.7806, "step": 1167300 }, { "epoch": 68.28098496812306, "grad_norm": 0.2402716726064682, "learning_rate": 0.001, "loss": 1.7771, "step": 1167400 }, { "epoch": 68.28683394747617, "grad_norm": 0.14010784029960632, "learning_rate": 0.001, "loss": 1.7733, "step": 1167500 }, { "epoch": 68.29268292682927, "grad_norm": 0.16746172308921814, "learning_rate": 0.001, "loss": 1.7759, "step": 1167600 }, { "epoch": 68.29853190618238, "grad_norm": 0.20290566980838776, "learning_rate": 0.001, "loss": 1.7721, "step": 1167700 }, { "epoch": 68.30438088553548, "grad_norm": 0.1517534852027893, "learning_rate": 0.001, "loss": 1.7748, "step": 1167800 }, { "epoch": 68.31022986488858, "grad_norm": 0.2030346393585205, "learning_rate": 0.001, "loss": 1.7697, "step": 1167900 }, { "epoch": 68.31607884424167, "grad_norm": 0.17007343471050262, "learning_rate": 0.001, "loss": 1.7768, "step": 1168000 }, { "epoch": 68.32192782359478, "grad_norm": 0.1540011316537857, "learning_rate": 0.001, "loss": 1.7771, "step": 1168100 }, { "epoch": 68.32777680294788, "grad_norm": 0.18775026500225067, "learning_rate": 0.001, "loss": 1.7736, "step": 1168200 }, { "epoch": 68.33362578230098, "grad_norm": 0.16158324480056763, "learning_rate": 0.001, "loss": 1.7798, "step": 1168300 }, { "epoch": 68.33947476165409, "grad_norm": 0.1935267597436905, "learning_rate": 0.001, "loss": 1.7789, "step": 1168400 }, { "epoch": 68.34532374100719, "grad_norm": 0.15396708250045776, "learning_rate": 0.001, "loss": 1.7779, "step": 1168500 }, { "epoch": 68.3511727203603, "grad_norm": 0.2655694782733917, "learning_rate": 0.001, "loss": 1.7795, "step": 1168600 }, { "epoch": 68.3570216997134, "grad_norm": 0.16767534613609314, "learning_rate": 0.001, "loss": 1.7762, "step": 1168700 }, { "epoch": 68.3628706790665, "grad_norm": 0.1612904965877533, "learning_rate": 0.001, "loss": 1.7755, "step": 1168800 }, { "epoch": 68.36871965841961, "grad_norm": 0.153105691075325, "learning_rate": 0.001, "loss": 1.7701, "step": 1168900 }, { "epoch": 68.37456863777271, "grad_norm": 0.25395554304122925, "learning_rate": 0.001, "loss": 1.7764, "step": 1169000 }, { "epoch": 68.38041761712581, "grad_norm": 0.14287975430488586, "learning_rate": 0.001, "loss": 1.7774, "step": 1169100 }, { "epoch": 68.38626659647892, "grad_norm": 0.1845678836107254, "learning_rate": 0.001, "loss": 1.7788, "step": 1169200 }, { "epoch": 68.39211557583202, "grad_norm": 0.14953571557998657, "learning_rate": 0.001, "loss": 1.7776, "step": 1169300 }, { "epoch": 68.39796455518513, "grad_norm": 0.1584041714668274, "learning_rate": 0.001, "loss": 1.7738, "step": 1169400 }, { "epoch": 68.40381353453823, "grad_norm": 0.1764654517173767, "learning_rate": 0.001, "loss": 1.7813, "step": 1169500 }, { "epoch": 68.40966251389132, "grad_norm": 0.14371417462825775, "learning_rate": 0.001, "loss": 1.7773, "step": 1169600 }, { "epoch": 68.41551149324442, "grad_norm": 0.19437310099601746, "learning_rate": 0.001, "loss": 1.7796, "step": 1169700 }, { "epoch": 68.42136047259753, "grad_norm": 0.2819368243217468, "learning_rate": 0.001, "loss": 1.7742, "step": 1169800 }, { "epoch": 68.42720945195063, "grad_norm": 0.18947747349739075, "learning_rate": 0.001, "loss": 1.7766, "step": 1169900 }, { "epoch": 68.43305843130373, "grad_norm": 0.21149428188800812, "learning_rate": 0.001, "loss": 1.772, "step": 1170000 } ], "logging_steps": 100, "max_steps": 1709700, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.22079361006515e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }