diff --git "a/self_refine/qwen3vl_2b_10pct/checkpoint-1000/trainer_state.json" "b/self_refine/qwen3vl_2b_10pct/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/self_refine/qwen3vl_2b_10pct/checkpoint-1000/trainer_state.json" @@ -0,0 +1,9034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3696516033638296, + "eval_steps": 500.0, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss": 1.1095314025878906, + "epoch": 0.0003696516033638296, + "grad_norm": 115.42657470703125, + "learning_rate": 3.3333333333333335e-07, + "loss": 4.4381256103515625, + "refine_loss": 0.0, + "step": 1 + }, + { + "ce_loss": 1.208587646484375, + "epoch": 0.0007393032067276592, + "grad_norm": 106.51912689208984, + "learning_rate": 6.666666666666667e-07, + "loss": 4.8343505859375, + "refine_loss": 0.0, + "step": 2 + }, + { + "ce_loss": 1.1239862442016602, + "epoch": 0.0011089548100914888, + "grad_norm": 67.46305084228516, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.495944976806641, + "refine_loss": 0.0, + "step": 3 + }, + { + "ce_loss": 1.1538887023925781, + "epoch": 0.0014786064134553183, + "grad_norm": 80.5199203491211, + "learning_rate": 1.3333333333333334e-06, + "loss": 4.6155548095703125, + "refine_loss": 0.0, + "step": 4 + }, + { + "ce_loss": 1.172698974609375, + "epoch": 0.001848258016819148, + "grad_norm": 94.67391204833984, + "learning_rate": 1.6666666666666667e-06, + "loss": 4.6907958984375, + "refine_loss": 0.0, + "step": 5 + }, + { + "ce_loss": 1.1981201171875, + "epoch": 0.0022179096201829776, + "grad_norm": 71.18059539794922, + "learning_rate": 2.0000000000000003e-06, + "loss": 4.79248046875, + "refine_loss": 0.0, + "step": 6 + }, + { + "ce_loss": 1.2137451171875, + "epoch": 0.0025875612235468073, + "grad_norm": 76.77693176269531, + "learning_rate": 2.3333333333333336e-06, + "loss": 4.85498046875, + "refine_loss": 0.0, + "step": 7 + }, + { + "ce_loss": 1.200469970703125, + "epoch": 0.0029572128269106366, + "grad_norm": 78.92556762695312, + "learning_rate": 2.666666666666667e-06, + "loss": 4.8018798828125, + "refine_loss": 0.0, + "step": 8 + }, + { + "ce_loss": 0.9800025224685669, + "epoch": 0.0033268644302744664, + "grad_norm": 60.26092529296875, + "learning_rate": 3e-06, + "loss": 3.9200100898742676, + "refine_loss": 0.0, + "step": 9 + }, + { + "ce_loss": 0.9466476440429688, + "epoch": 0.003696516033638296, + "grad_norm": 38.90607833862305, + "learning_rate": 3.3333333333333333e-06, + "loss": 3.786590576171875, + "refine_loss": 0.0, + "step": 10 + }, + { + "ce_loss": 1.1175155639648438, + "epoch": 0.004066167637002125, + "grad_norm": 49.58339309692383, + "learning_rate": 3.6666666666666666e-06, + "loss": 4.470062255859375, + "refine_loss": 0.0, + "step": 11 + }, + { + "ce_loss": 0.8636951446533203, + "epoch": 0.004435819240365955, + "grad_norm": 22.93377685546875, + "learning_rate": 4.000000000000001e-06, + "loss": 3.4547805786132812, + "refine_loss": 0.0, + "step": 12 + }, + { + "ce_loss": 0.9014892578125, + "epoch": 0.004805470843729785, + "grad_norm": 37.222251892089844, + "learning_rate": 4.333333333333334e-06, + "loss": 3.60595703125, + "refine_loss": 0.0, + "step": 13 + }, + { + "ce_loss": 0.7541656494140625, + "epoch": 0.005175122447093615, + "grad_norm": 19.505725860595703, + "learning_rate": 4.666666666666667e-06, + "loss": 3.01666259765625, + "refine_loss": 0.0, + "step": 14 + }, + { + "ce_loss": 0.8701057434082031, + "epoch": 0.0055447740504574435, + "grad_norm": 27.46717643737793, + "learning_rate": 5e-06, + "loss": 3.4804229736328125, + "refine_loss": 0.0, + "step": 15 + }, + { + "ce_loss": 0.7495651245117188, + "epoch": 0.005914425653821273, + "grad_norm": 16.36368179321289, + "learning_rate": 5.333333333333334e-06, + "loss": 2.998260498046875, + "refine_loss": 0.0, + "step": 16 + }, + { + "ce_loss": 0.7133150100708008, + "epoch": 0.006284077257185103, + "grad_norm": 17.542661666870117, + "learning_rate": 5.666666666666667e-06, + "loss": 2.853260040283203, + "refine_loss": 0.0, + "step": 17 + }, + { + "ce_loss": 0.800048828125, + "epoch": 0.006653728860548933, + "grad_norm": 23.1492977142334, + "learning_rate": 6e-06, + "loss": 3.2001953125, + "refine_loss": 0.0, + "step": 18 + }, + { + "ce_loss": 0.7589683532714844, + "epoch": 0.0070233804639127624, + "grad_norm": 21.050495147705078, + "learning_rate": 6.333333333333333e-06, + "loss": 3.0358734130859375, + "refine_loss": 0.0, + "step": 19 + }, + { + "ce_loss": 0.62774658203125, + "epoch": 0.007393032067276592, + "grad_norm": 18.764785766601562, + "learning_rate": 6.666666666666667e-06, + "loss": 2.510986328125, + "refine_loss": 0.0, + "step": 20 + }, + { + "ce_loss": 0.780303955078125, + "epoch": 0.007762683670640421, + "grad_norm": 20.638334274291992, + "learning_rate": 7e-06, + "loss": 3.1212158203125, + "refine_loss": 0.0, + "step": 21 + }, + { + "ce_loss": 0.7488393783569336, + "epoch": 0.00813233527400425, + "grad_norm": 15.54475212097168, + "learning_rate": 7.333333333333333e-06, + "loss": 2.9953575134277344, + "refine_loss": 0.0, + "step": 22 + }, + { + "ce_loss": 0.783050537109375, + "epoch": 0.00850198687736808, + "grad_norm": 17.008934020996094, + "learning_rate": 7.666666666666667e-06, + "loss": 3.1322021484375, + "refine_loss": 0.0, + "step": 23 + }, + { + "ce_loss": 0.5948047637939453, + "epoch": 0.00887163848073191, + "grad_norm": 11.888081550598145, + "learning_rate": 8.000000000000001e-06, + "loss": 2.3792190551757812, + "refine_loss": 0.0, + "step": 24 + }, + { + "ce_loss": 0.5792684555053711, + "epoch": 0.00924129008409574, + "grad_norm": 14.061644554138184, + "learning_rate": 8.333333333333334e-06, + "loss": 2.3170738220214844, + "refine_loss": 0.0, + "step": 25 + }, + { + "ce_loss": 0.41719579696655273, + "epoch": 0.00961094168745957, + "grad_norm": 12.33115291595459, + "learning_rate": 8.666666666666668e-06, + "loss": 1.668783187866211, + "refine_loss": 0.0, + "step": 26 + }, + { + "ce_loss": 0.5683321952819824, + "epoch": 0.0099805932908234, + "grad_norm": 10.680821418762207, + "learning_rate": 9e-06, + "loss": 2.2733287811279297, + "refine_loss": 0.0, + "step": 27 + }, + { + "ce_loss": 0.49542999267578125, + "epoch": 0.01035024489418723, + "grad_norm": 13.718324661254883, + "learning_rate": 9.333333333333334e-06, + "loss": 1.981719970703125, + "refine_loss": 0.0, + "step": 28 + }, + { + "ce_loss": 0.4219522476196289, + "epoch": 0.010719896497551059, + "grad_norm": 10.822561264038086, + "learning_rate": 9.666666666666667e-06, + "loss": 1.6878089904785156, + "refine_loss": 0.0, + "step": 29 + }, + { + "ce_loss": 0.4342775344848633, + "epoch": 0.011089548100914887, + "grad_norm": 12.066676139831543, + "learning_rate": 1e-05, + "loss": 1.7371101379394531, + "refine_loss": 0.0, + "step": 30 + }, + { + "ce_loss": 0.6757955551147461, + "epoch": 0.011459199704278717, + "grad_norm": 19.141977310180664, + "learning_rate": 9.99997377618298e-06, + "loss": 2.7031822204589844, + "refine_loss": 0.0, + "step": 31 + }, + { + "ce_loss": 0.6072940826416016, + "epoch": 0.011828851307642546, + "grad_norm": 15.509014129638672, + "learning_rate": 9.999895105006995e-06, + "loss": 2.4291763305664062, + "refine_loss": 0.0, + "step": 32 + }, + { + "ce_loss": 0.537724494934082, + "epoch": 0.012198502911006376, + "grad_norm": 25.71844482421875, + "learning_rate": 9.999763987297266e-06, + "loss": 2.150897979736328, + "refine_loss": 0.0, + "step": 33 + }, + { + "ce_loss": 0.5607585906982422, + "epoch": 0.012568154514370206, + "grad_norm": 15.566309928894043, + "learning_rate": 9.99958042442916e-06, + "loss": 2.2430343627929688, + "refine_loss": 0.0, + "step": 34 + }, + { + "ce_loss": 0.6869277954101562, + "epoch": 0.012937806117734036, + "grad_norm": 20.027860641479492, + "learning_rate": 9.999344418328161e-06, + "loss": 2.747711181640625, + "refine_loss": 0.0, + "step": 35 + }, + { + "ce_loss": 0.5944743156433105, + "epoch": 0.013307457721097865, + "grad_norm": 16.63701820373535, + "learning_rate": 9.999055971469864e-06, + "loss": 2.377897262573242, + "refine_loss": 0.0, + "step": 36 + }, + { + "ce_loss": 0.6992354393005371, + "epoch": 0.013677109324461695, + "grad_norm": 11.606870651245117, + "learning_rate": 9.998715086879938e-06, + "loss": 2.7969417572021484, + "refine_loss": 0.0, + "step": 37 + }, + { + "ce_loss": 0.5765266418457031, + "epoch": 0.014046760927825525, + "grad_norm": 11.0557861328125, + "learning_rate": 9.998321768134101e-06, + "loss": 2.3061065673828125, + "refine_loss": 0.0, + "step": 38 + }, + { + "ce_loss": 0.6655747890472412, + "epoch": 0.014416412531189355, + "grad_norm": 12.986700057983398, + "learning_rate": 9.997876019358083e-06, + "loss": 2.662299156188965, + "refine_loss": 0.0, + "step": 39 + }, + { + "ce_loss": 0.4289727210998535, + "epoch": 0.014786064134553184, + "grad_norm": 13.342203140258789, + "learning_rate": 9.997377845227577e-06, + "loss": 1.715890884399414, + "refine_loss": 0.0, + "step": 40 + }, + { + "ce_loss": 0.6916851997375488, + "epoch": 0.015155715737917012, + "grad_norm": 12.713790893554688, + "learning_rate": 9.99682725096819e-06, + "loss": 2.7667407989501953, + "refine_loss": 0.0, + "step": 41 + }, + { + "ce_loss": 0.601173996925354, + "epoch": 0.015525367341280842, + "grad_norm": 14.236647605895996, + "learning_rate": 9.9962242423554e-06, + "loss": 2.404695987701416, + "refine_loss": 0.0, + "step": 42 + }, + { + "ce_loss": 0.46469664573669434, + "epoch": 0.015895018944644672, + "grad_norm": 14.929333686828613, + "learning_rate": 9.995568825714479e-06, + "loss": 1.8587865829467773, + "refine_loss": 0.0, + "step": 43 + }, + { + "ce_loss": 0.6185917854309082, + "epoch": 0.0162646705480085, + "grad_norm": 15.194661140441895, + "learning_rate": 9.99486100792044e-06, + "loss": 2.474367141723633, + "refine_loss": 0.0, + "step": 44 + }, + { + "ce_loss": 0.5654895305633545, + "epoch": 0.01663432215137233, + "grad_norm": 23.998876571655273, + "learning_rate": 9.994100796397954e-06, + "loss": 2.261958122253418, + "refine_loss": 0.0, + "step": 45 + }, + { + "ce_loss": 0.49540412425994873, + "epoch": 0.01700397375473616, + "grad_norm": 16.902929306030273, + "learning_rate": 9.993288199121283e-06, + "loss": 1.981616497039795, + "refine_loss": 0.0, + "step": 46 + }, + { + "ce_loss": 0.4463376998901367, + "epoch": 0.01737362535809999, + "grad_norm": 13.594534873962402, + "learning_rate": 9.992423224614185e-06, + "loss": 1.7853507995605469, + "refine_loss": 0.0, + "step": 47 + }, + { + "ce_loss": 0.6757125854492188, + "epoch": 0.01774327696146382, + "grad_norm": 11.977783203125, + "learning_rate": 9.991505881949837e-06, + "loss": 2.702850341796875, + "refine_loss": 0.0, + "step": 48 + }, + { + "ce_loss": 0.7240762710571289, + "epoch": 0.01811292856482765, + "grad_norm": 13.226062774658203, + "learning_rate": 9.990536180750724e-06, + "loss": 2.8963050842285156, + "refine_loss": 0.0, + "step": 49 + }, + { + "ce_loss": 0.5374159812927246, + "epoch": 0.01848258016819148, + "grad_norm": 10.17024040222168, + "learning_rate": 9.98951413118856e-06, + "loss": 2.1496639251708984, + "refine_loss": 0.0, + "step": 50 + }, + { + "ce_loss": 0.5094146728515625, + "epoch": 0.01885223177155531, + "grad_norm": 9.858248710632324, + "learning_rate": 9.988439743984155e-06, + "loss": 2.03765869140625, + "refine_loss": 0.0, + "step": 51 + }, + { + "ce_loss": 0.44951534271240234, + "epoch": 0.01922188337491914, + "grad_norm": 11.89635181427002, + "learning_rate": 9.987313030407325e-06, + "loss": 1.7980613708496094, + "refine_loss": 0.0, + "step": 52 + }, + { + "ce_loss": 0.6119918823242188, + "epoch": 0.01959153497828297, + "grad_norm": 14.51465129852295, + "learning_rate": 9.98613400227676e-06, + "loss": 2.447967529296875, + "refine_loss": 0.0, + "step": 53 + }, + { + "ce_loss": 0.5106334686279297, + "epoch": 0.0199611865816468, + "grad_norm": 13.626294136047363, + "learning_rate": 9.984902671959911e-06, + "loss": 2.0425338745117188, + "refine_loss": 0.0, + "step": 54 + }, + { + "ce_loss": 0.8825030326843262, + "epoch": 0.02033083818501063, + "grad_norm": 18.620595932006836, + "learning_rate": 9.983619052372847e-06, + "loss": 3.5300121307373047, + "refine_loss": 0.0, + "step": 55 + }, + { + "ce_loss": 0.5259208679199219, + "epoch": 0.02070048978837446, + "grad_norm": 9.573857307434082, + "learning_rate": 9.982283156980133e-06, + "loss": 2.1036834716796875, + "refine_loss": 0.0, + "step": 56 + }, + { + "ce_loss": 0.5167083740234375, + "epoch": 0.021070141391738288, + "grad_norm": 13.657594680786133, + "learning_rate": 9.980894999794678e-06, + "loss": 2.06683349609375, + "refine_loss": 0.0, + "step": 57 + }, + { + "ce_loss": 0.6286582946777344, + "epoch": 0.021439792995102118, + "grad_norm": 9.296985626220703, + "learning_rate": 9.979454595377594e-06, + "loss": 2.5146331787109375, + "refine_loss": 0.0, + "step": 58 + }, + { + "ce_loss": 0.5636241436004639, + "epoch": 0.021809444598465944, + "grad_norm": 9.587523460388184, + "learning_rate": 9.97796195883804e-06, + "loss": 2.2544965744018555, + "refine_loss": 0.0, + "step": 59 + }, + { + "ce_loss": 0.5001065731048584, + "epoch": 0.022179096201829774, + "grad_norm": 10.750041007995605, + "learning_rate": 9.97641710583307e-06, + "loss": 2.0004262924194336, + "refine_loss": 0.0, + "step": 60 + }, + { + "ce_loss": 0.4619925022125244, + "epoch": 0.022548747805193604, + "grad_norm": 10.245471000671387, + "learning_rate": 9.97482005256746e-06, + "loss": 1.8479700088500977, + "refine_loss": 0.0, + "step": 61 + }, + { + "ce_loss": 0.46030664443969727, + "epoch": 0.022918399408557433, + "grad_norm": 8.520600318908691, + "learning_rate": 9.973170815793543e-06, + "loss": 1.841226577758789, + "refine_loss": 0.0, + "step": 62 + }, + { + "ce_loss": 0.4949650764465332, + "epoch": 0.023288051011921263, + "grad_norm": 9.956713676452637, + "learning_rate": 9.971469412811032e-06, + "loss": 1.9798603057861328, + "refine_loss": 0.0, + "step": 63 + }, + { + "ce_loss": 0.4925670623779297, + "epoch": 0.023657702615285093, + "grad_norm": 19.084941864013672, + "learning_rate": 9.969715861466839e-06, + "loss": 1.9702682495117188, + "refine_loss": 0.0, + "step": 64 + }, + { + "ce_loss": 0.5608302354812622, + "epoch": 0.024027354218648923, + "grad_norm": 13.109268188476562, + "learning_rate": 9.96791018015489e-06, + "loss": 2.243320941925049, + "refine_loss": 0.0, + "step": 65 + }, + { + "ce_loss": 0.5516796112060547, + "epoch": 0.024397005822012752, + "grad_norm": 11.381149291992188, + "learning_rate": 9.966052387815923e-06, + "loss": 2.2067184448242188, + "refine_loss": 0.0, + "step": 66 + }, + { + "ce_loss": 0.3931121826171875, + "epoch": 0.024766657425376582, + "grad_norm": 9.816267967224121, + "learning_rate": 9.964142503937305e-06, + "loss": 1.57244873046875, + "refine_loss": 0.0, + "step": 67 + }, + { + "ce_loss": 0.31940579414367676, + "epoch": 0.025136309028740412, + "grad_norm": 10.21517562866211, + "learning_rate": 9.962180548552812e-06, + "loss": 1.277623176574707, + "refine_loss": 0.0, + "step": 68 + }, + { + "ce_loss": 0.4471263885498047, + "epoch": 0.02550596063210424, + "grad_norm": 9.68551254272461, + "learning_rate": 9.96016654224243e-06, + "loss": 1.7885055541992188, + "refine_loss": 0.0, + "step": 69 + }, + { + "ce_loss": 0.38400816917419434, + "epoch": 0.02587561223546807, + "grad_norm": 8.190732955932617, + "learning_rate": 9.958100506132127e-06, + "loss": 1.5360326766967773, + "refine_loss": 0.0, + "step": 70 + }, + { + "ce_loss": 0.4605998992919922, + "epoch": 0.0262452638388319, + "grad_norm": 12.649483680725098, + "learning_rate": 9.955982461893648e-06, + "loss": 1.8423995971679688, + "refine_loss": 0.0, + "step": 71 + }, + { + "ce_loss": 0.4523935317993164, + "epoch": 0.02661491544219573, + "grad_norm": 9.617040634155273, + "learning_rate": 9.953812431744274e-06, + "loss": 1.8095741271972656, + "refine_loss": 0.0, + "step": 72 + }, + { + "ce_loss": 0.582726001739502, + "epoch": 0.02698456704555956, + "grad_norm": 12.793085098266602, + "learning_rate": 9.951590438446597e-06, + "loss": 2.330904006958008, + "refine_loss": 0.0, + "step": 73 + }, + { + "ce_loss": 0.4675734043121338, + "epoch": 0.02735421864892339, + "grad_norm": 10.02615737915039, + "learning_rate": 9.94931650530827e-06, + "loss": 1.8702936172485352, + "refine_loss": 0.0, + "step": 74 + }, + { + "ce_loss": 0.43453407287597656, + "epoch": 0.02772387025228722, + "grad_norm": 11.872008323669434, + "learning_rate": 9.946990656181782e-06, + "loss": 1.7381362915039062, + "refine_loss": 0.0, + "step": 75 + }, + { + "ce_loss": 0.27246296405792236, + "epoch": 0.02809352185565105, + "grad_norm": 10.81119155883789, + "learning_rate": 9.944612915464183e-06, + "loss": 1.0898518562316895, + "refine_loss": 0.0, + "step": 76 + }, + { + "ce_loss": 0.5746393203735352, + "epoch": 0.02846317345901488, + "grad_norm": 11.642266273498535, + "learning_rate": 9.942183308096853e-06, + "loss": 2.2985572814941406, + "refine_loss": 0.0, + "step": 77 + }, + { + "ce_loss": 0.3775515556335449, + "epoch": 0.02883282506237871, + "grad_norm": 7.531744003295898, + "learning_rate": 9.93970185956522e-06, + "loss": 1.5102062225341797, + "refine_loss": 0.0, + "step": 78 + }, + { + "ce_loss": 0.5148732662200928, + "epoch": 0.02920247666574254, + "grad_norm": 9.864612579345703, + "learning_rate": 9.93716859589851e-06, + "loss": 2.059493064880371, + "refine_loss": 0.0, + "step": 79 + }, + { + "ce_loss": 0.4321134090423584, + "epoch": 0.02957212826910637, + "grad_norm": 10.719852447509766, + "learning_rate": 9.934583543669454e-06, + "loss": 1.7284536361694336, + "refine_loss": 0.0, + "step": 80 + }, + { + "ce_loss": 0.5056421756744385, + "epoch": 0.0299417798724702, + "grad_norm": 11.71694564819336, + "learning_rate": 9.93194672999403e-06, + "loss": 2.022568702697754, + "refine_loss": 0.0, + "step": 81 + }, + { + "ce_loss": 0.3861856460571289, + "epoch": 0.030311431475834025, + "grad_norm": 11.259998321533203, + "learning_rate": 9.929258182531167e-06, + "loss": 1.5447425842285156, + "refine_loss": 0.0, + "step": 82 + }, + { + "ce_loss": 0.48786067962646484, + "epoch": 0.030681083079197854, + "grad_norm": 13.072247505187988, + "learning_rate": 9.926517929482454e-06, + "loss": 1.9514427185058594, + "refine_loss": 0.0, + "step": 83 + }, + { + "ce_loss": 0.4842214584350586, + "epoch": 0.031050734682561684, + "grad_norm": 9.760059356689453, + "learning_rate": 9.923725999591846e-06, + "loss": 1.9368858337402344, + "refine_loss": 0.0, + "step": 84 + }, + { + "ce_loss": 0.46020638942718506, + "epoch": 0.031420386285925514, + "grad_norm": 9.29644775390625, + "learning_rate": 9.920882422145372e-06, + "loss": 1.8408255577087402, + "refine_loss": 0.0, + "step": 85 + }, + { + "ce_loss": 0.42596590518951416, + "epoch": 0.031790037889289344, + "grad_norm": 8.990376472473145, + "learning_rate": 9.917987226970811e-06, + "loss": 1.7038636207580566, + "refine_loss": 0.0, + "step": 86 + }, + { + "ce_loss": 0.46361875534057617, + "epoch": 0.03215968949265317, + "grad_norm": 8.638200759887695, + "learning_rate": 9.91504044443739e-06, + "loss": 1.8544750213623047, + "refine_loss": 0.0, + "step": 87 + }, + { + "ce_loss": 0.4239847660064697, + "epoch": 0.032529341096017, + "grad_norm": 8.489425659179688, + "learning_rate": 9.912042105455462e-06, + "loss": 1.695939064025879, + "refine_loss": 0.0, + "step": 88 + }, + { + "ce_loss": 0.5140061974525452, + "epoch": 0.03289899269938083, + "grad_norm": 11.088143348693848, + "learning_rate": 9.908992241476189e-06, + "loss": 2.0560247898101807, + "refine_loss": 0.0, + "step": 89 + }, + { + "ce_loss": 0.4734039306640625, + "epoch": 0.03326864430274466, + "grad_norm": 8.540122032165527, + "learning_rate": 9.905890884491196e-06, + "loss": 1.89361572265625, + "refine_loss": 0.0, + "step": 90 + }, + { + "ce_loss": 0.4609231948852539, + "epoch": 0.03363829590610849, + "grad_norm": 9.909972190856934, + "learning_rate": 9.902738067032254e-06, + "loss": 1.8436927795410156, + "refine_loss": 0.0, + "step": 91 + }, + { + "ce_loss": 0.37462282180786133, + "epoch": 0.03400794750947232, + "grad_norm": 8.560324668884277, + "learning_rate": 9.899533822170922e-06, + "loss": 1.4984912872314453, + "refine_loss": 0.0, + "step": 92 + }, + { + "ce_loss": 0.4274463653564453, + "epoch": 0.03437759911283615, + "grad_norm": 9.640644073486328, + "learning_rate": 9.896278183518216e-06, + "loss": 1.7097854614257812, + "refine_loss": 0.0, + "step": 93 + }, + { + "ce_loss": 0.5064095258712769, + "epoch": 0.03474725071619998, + "grad_norm": 10.19467544555664, + "learning_rate": 9.892971185224244e-06, + "loss": 2.0256381034851074, + "refine_loss": 0.0, + "step": 94 + }, + { + "ce_loss": 0.5968835353851318, + "epoch": 0.03511690231956381, + "grad_norm": 9.20285415649414, + "learning_rate": 9.889612861977855e-06, + "loss": 2.3875341415405273, + "refine_loss": 0.0, + "step": 95 + }, + { + "ce_loss": 0.4285084009170532, + "epoch": 0.03548655392292764, + "grad_norm": 8.602506637573242, + "learning_rate": 9.886203249006265e-06, + "loss": 1.714033603668213, + "refine_loss": 0.0, + "step": 96 + }, + { + "ce_loss": 0.5147629976272583, + "epoch": 0.03585620552629147, + "grad_norm": 9.472990989685059, + "learning_rate": 9.882742382074707e-06, + "loss": 2.059051990509033, + "refine_loss": 0.0, + "step": 97 + }, + { + "ce_loss": 0.3420184850692749, + "epoch": 0.0362258571296553, + "grad_norm": 8.319692611694336, + "learning_rate": 9.879230297486034e-06, + "loss": 1.3680739402770996, + "refine_loss": 0.0, + "step": 98 + }, + { + "ce_loss": 0.506950855255127, + "epoch": 0.03659550873301913, + "grad_norm": 12.215727806091309, + "learning_rate": 9.875667032080354e-06, + "loss": 2.027803421020508, + "refine_loss": 0.0, + "step": 99 + }, + { + "ce_loss": 0.4197876453399658, + "epoch": 0.03696516033638296, + "grad_norm": 25.909793853759766, + "learning_rate": 9.872052623234632e-06, + "loss": 1.6791505813598633, + "refine_loss": 0.0, + "step": 100 + }, + { + "ce_loss": 0.5791604518890381, + "epoch": 0.03733481193974679, + "grad_norm": 10.472227096557617, + "learning_rate": 9.868387108862307e-06, + "loss": 2.3166418075561523, + "refine_loss": 0.0, + "step": 101 + }, + { + "ce_loss": 0.5240020751953125, + "epoch": 0.03770446354311062, + "grad_norm": 15.826552391052246, + "learning_rate": 9.864670527412891e-06, + "loss": 2.09600830078125, + "refine_loss": 0.0, + "step": 102 + }, + { + "ce_loss": 0.4309847354888916, + "epoch": 0.03807411514647445, + "grad_norm": 10.410178184509277, + "learning_rate": 9.860902917871566e-06, + "loss": 1.7239389419555664, + "refine_loss": 0.0, + "step": 103 + }, + { + "ce_loss": 0.4111976623535156, + "epoch": 0.03844376674983828, + "grad_norm": 12.73709774017334, + "learning_rate": 9.857084319758772e-06, + "loss": 1.6447906494140625, + "refine_loss": 0.0, + "step": 104 + }, + { + "ce_loss": 0.4270365238189697, + "epoch": 0.03881341835320211, + "grad_norm": 10.375432014465332, + "learning_rate": 9.853214773129796e-06, + "loss": 1.708146095275879, + "refine_loss": 0.0, + "step": 105 + }, + { + "ce_loss": 0.5208024978637695, + "epoch": 0.03918306995656594, + "grad_norm": 8.4111967086792, + "learning_rate": 9.849294318574353e-06, + "loss": 2.083209991455078, + "refine_loss": 0.0, + "step": 106 + }, + { + "ce_loss": 0.3873509168624878, + "epoch": 0.03955272155992977, + "grad_norm": 10.028768539428711, + "learning_rate": 9.845322997216153e-06, + "loss": 1.5494036674499512, + "refine_loss": 0.0, + "step": 107 + }, + { + "ce_loss": 0.4204486608505249, + "epoch": 0.0399223731632936, + "grad_norm": 8.498994827270508, + "learning_rate": 9.841300850712479e-06, + "loss": 1.6817946434020996, + "refine_loss": 0.0, + "step": 108 + }, + { + "ce_loss": 0.35520946979522705, + "epoch": 0.04029202476665743, + "grad_norm": 9.784452438354492, + "learning_rate": 9.837227921253747e-06, + "loss": 1.4208378791809082, + "refine_loss": 0.0, + "step": 109 + }, + { + "ce_loss": 0.5272456407546997, + "epoch": 0.04066167637002126, + "grad_norm": 12.525003433227539, + "learning_rate": 9.833104251563058e-06, + "loss": 2.108982563018799, + "refine_loss": 0.0, + "step": 110 + }, + { + "ce_loss": 0.5461859703063965, + "epoch": 0.04103132797338509, + "grad_norm": 10.434538841247559, + "learning_rate": 9.828929884895753e-06, + "loss": 2.184743881225586, + "refine_loss": 0.0, + "step": 111 + }, + { + "ce_loss": 0.37354278564453125, + "epoch": 0.04140097957674892, + "grad_norm": 10.147275924682617, + "learning_rate": 9.824704865038967e-06, + "loss": 1.494171142578125, + "refine_loss": 0.0, + "step": 112 + }, + { + "ce_loss": 0.42732977867126465, + "epoch": 0.04177063118011275, + "grad_norm": 10.19455337524414, + "learning_rate": 9.820429236311158e-06, + "loss": 1.7093191146850586, + "refine_loss": 0.0, + "step": 113 + }, + { + "ce_loss": 0.46251100301742554, + "epoch": 0.042140282783476576, + "grad_norm": 9.22331428527832, + "learning_rate": 9.816103043561648e-06, + "loss": 1.8500440120697021, + "refine_loss": 0.0, + "step": 114 + }, + { + "ce_loss": 0.443803608417511, + "epoch": 0.042509934386840406, + "grad_norm": 12.951189994812012, + "learning_rate": 9.811726332170153e-06, + "loss": 1.775214433670044, + "refine_loss": 0.0, + "step": 115 + }, + { + "ce_loss": 0.423317015171051, + "epoch": 0.042879585990204236, + "grad_norm": 7.769314765930176, + "learning_rate": 9.807299148046301e-06, + "loss": 1.693268060684204, + "refine_loss": 0.0, + "step": 116 + }, + { + "ce_loss": 0.43371152877807617, + "epoch": 0.04324923759356806, + "grad_norm": 21.21959686279297, + "learning_rate": 9.802821537629162e-06, + "loss": 1.7348461151123047, + "refine_loss": 0.0, + "step": 117 + }, + { + "ce_loss": 0.5766236782073975, + "epoch": 0.04361888919693189, + "grad_norm": 10.444099426269531, + "learning_rate": 9.798293547886748e-06, + "loss": 2.30649471282959, + "refine_loss": 0.0, + "step": 118 + }, + { + "ce_loss": 0.5190873146057129, + "epoch": 0.04398854080029572, + "grad_norm": 11.069305419921875, + "learning_rate": 9.79371522631553e-06, + "loss": 2.0763492584228516, + "refine_loss": 0.0, + "step": 119 + }, + { + "ce_loss": 0.4422330856323242, + "epoch": 0.04435819240365955, + "grad_norm": 12.272151947021484, + "learning_rate": 9.789086620939936e-06, + "loss": 1.7689323425292969, + "refine_loss": 0.0, + "step": 120 + }, + { + "ce_loss": 0.5175392627716064, + "epoch": 0.04472784400702338, + "grad_norm": 8.745708465576172, + "learning_rate": 9.784407780311845e-06, + "loss": 2.070157051086426, + "refine_loss": 0.0, + "step": 121 + }, + { + "ce_loss": 0.587367057800293, + "epoch": 0.04509749561038721, + "grad_norm": 10.431843757629395, + "learning_rate": 9.779678753510082e-06, + "loss": 2.349468231201172, + "refine_loss": 0.0, + "step": 122 + }, + { + "ce_loss": 0.5118415355682373, + "epoch": 0.04546714721375104, + "grad_norm": 8.445225715637207, + "learning_rate": 9.774899590139897e-06, + "loss": 2.047366142272949, + "refine_loss": 0.0, + "step": 123 + }, + { + "ce_loss": 0.46656978130340576, + "epoch": 0.04583679881711487, + "grad_norm": 9.370891571044922, + "learning_rate": 9.770070340332457e-06, + "loss": 1.866279125213623, + "refine_loss": 0.0, + "step": 124 + }, + { + "ce_loss": 0.6051151752471924, + "epoch": 0.0462064504204787, + "grad_norm": 9.541234970092773, + "learning_rate": 9.765191054744305e-06, + "loss": 2.4204607009887695, + "refine_loss": 0.0, + "step": 125 + }, + { + "ce_loss": 0.4347813129425049, + "epoch": 0.046576102023842526, + "grad_norm": 10.442450523376465, + "learning_rate": 9.76026178455684e-06, + "loss": 1.7391252517700195, + "refine_loss": 0.0, + "step": 126 + }, + { + "ce_loss": 0.46814584732055664, + "epoch": 0.046945753627206356, + "grad_norm": 9.998276710510254, + "learning_rate": 9.755282581475769e-06, + "loss": 1.8725833892822266, + "refine_loss": 0.0, + "step": 127 + }, + { + "ce_loss": 0.46575212478637695, + "epoch": 0.047315405230570186, + "grad_norm": 10.918929100036621, + "learning_rate": 9.75025349773058e-06, + "loss": 1.8630084991455078, + "refine_loss": 0.0, + "step": 128 + }, + { + "ce_loss": 0.4933452606201172, + "epoch": 0.047685056833934016, + "grad_norm": 8.451947212219238, + "learning_rate": 9.745174586073982e-06, + "loss": 1.9733810424804688, + "refine_loss": 0.0, + "step": 129 + }, + { + "ce_loss": 0.4042043685913086, + "epoch": 0.048054708437297845, + "grad_norm": 8.686161041259766, + "learning_rate": 9.740045899781353e-06, + "loss": 1.6168174743652344, + "refine_loss": 0.0, + "step": 130 + }, + { + "ce_loss": 0.49583637714385986, + "epoch": 0.048424360040661675, + "grad_norm": 11.208438873291016, + "learning_rate": 9.734867492650187e-06, + "loss": 1.9833455085754395, + "refine_loss": 0.0, + "step": 131 + }, + { + "ce_loss": 0.5204379558563232, + "epoch": 0.048794011644025505, + "grad_norm": 13.64596176147461, + "learning_rate": 9.729639418999524e-06, + "loss": 2.081751823425293, + "refine_loss": 0.0, + "step": 132 + }, + { + "ce_loss": 0.5283682346343994, + "epoch": 0.049163663247389335, + "grad_norm": 10.409852027893066, + "learning_rate": 9.724361733669383e-06, + "loss": 2.1134729385375977, + "refine_loss": 0.0, + "step": 133 + }, + { + "ce_loss": 0.41047000885009766, + "epoch": 0.049533314850753164, + "grad_norm": 8.218536376953125, + "learning_rate": 9.719034492020183e-06, + "loss": 1.6418800354003906, + "refine_loss": 0.0, + "step": 134 + }, + { + "ce_loss": 0.5120958089828491, + "epoch": 0.049902966454116994, + "grad_norm": 12.63016128540039, + "learning_rate": 9.713657749932172e-06, + "loss": 2.0483832359313965, + "refine_loss": 0.0, + "step": 135 + }, + { + "ce_loss": 0.4475950002670288, + "epoch": 0.050272618057480824, + "grad_norm": 8.611896514892578, + "learning_rate": 9.708231563804828e-06, + "loss": 1.7903800010681152, + "refine_loss": 0.0, + "step": 136 + }, + { + "ce_loss": 0.5860188007354736, + "epoch": 0.050642269660844653, + "grad_norm": 11.123357772827148, + "learning_rate": 9.702755990556277e-06, + "loss": 2.3440752029418945, + "refine_loss": 0.0, + "step": 137 + }, + { + "ce_loss": 0.35729652643203735, + "epoch": 0.05101192126420848, + "grad_norm": 8.195504188537598, + "learning_rate": 9.697231087622691e-06, + "loss": 1.4291861057281494, + "refine_loss": 0.0, + "step": 138 + }, + { + "ce_loss": 0.5091031789779663, + "epoch": 0.05138157286757231, + "grad_norm": 10.71363639831543, + "learning_rate": 9.691656912957686e-06, + "loss": 2.0364127159118652, + "refine_loss": 0.0, + "step": 139 + }, + { + "ce_loss": 0.6348462700843811, + "epoch": 0.05175122447093614, + "grad_norm": 10.407415390014648, + "learning_rate": 9.68603352503172e-06, + "loss": 2.5393850803375244, + "refine_loss": 0.0, + "step": 140 + }, + { + "ce_loss": 0.5838260650634766, + "epoch": 0.05212087607429997, + "grad_norm": 10.073226928710938, + "learning_rate": 9.680360982831467e-06, + "loss": 2.3353042602539062, + "refine_loss": 0.0, + "step": 141 + }, + { + "ce_loss": 0.5728874206542969, + "epoch": 0.0524905276776638, + "grad_norm": 8.901334762573242, + "learning_rate": 9.674639345859213e-06, + "loss": 2.2915496826171875, + "refine_loss": 0.0, + "step": 142 + }, + { + "ce_loss": 0.4548962712287903, + "epoch": 0.05286017928102763, + "grad_norm": 10.343681335449219, + "learning_rate": 9.668868674132224e-06, + "loss": 1.8195850849151611, + "refine_loss": 0.0, + "step": 143 + }, + { + "ce_loss": 0.5592417120933533, + "epoch": 0.05322983088439146, + "grad_norm": 9.573619842529297, + "learning_rate": 9.663049028182112e-06, + "loss": 2.236966848373413, + "refine_loss": 0.0, + "step": 144 + }, + { + "ce_loss": 0.5424580574035645, + "epoch": 0.05359948248775529, + "grad_norm": 10.394701957702637, + "learning_rate": 9.657180469054213e-06, + "loss": 2.169832229614258, + "refine_loss": 0.0, + "step": 145 + }, + { + "ce_loss": 0.36916297674179077, + "epoch": 0.05396913409111912, + "grad_norm": 7.844456195831299, + "learning_rate": 9.651263058306932e-06, + "loss": 1.476651906967163, + "refine_loss": 0.0, + "step": 146 + }, + { + "ce_loss": 0.3137897849082947, + "epoch": 0.05433878569448295, + "grad_norm": 14.026036262512207, + "learning_rate": 9.645296858011109e-06, + "loss": 1.2551591396331787, + "refine_loss": 0.0, + "step": 147 + }, + { + "ce_loss": 0.3144235610961914, + "epoch": 0.05470843729784678, + "grad_norm": 9.94989013671875, + "learning_rate": 9.639281930749363e-06, + "loss": 1.2576942443847656, + "refine_loss": 0.0, + "step": 148 + }, + { + "ce_loss": 0.47137540578842163, + "epoch": 0.05507808890121061, + "grad_norm": 11.059707641601562, + "learning_rate": 9.633218339615433e-06, + "loss": 1.8855016231536865, + "refine_loss": 0.0, + "step": 149 + }, + { + "ce_loss": 0.37241339683532715, + "epoch": 0.05544774050457444, + "grad_norm": 7.970921039581299, + "learning_rate": 9.627106148213521e-06, + "loss": 1.4896535873413086, + "refine_loss": 0.0, + "step": 150 + }, + { + "ce_loss": 0.5733976364135742, + "epoch": 0.05581739210793827, + "grad_norm": 12.695771217346191, + "learning_rate": 9.620945420657625e-06, + "loss": 2.293590545654297, + "refine_loss": 0.0, + "step": 151 + }, + { + "ce_loss": 0.48796534538269043, + "epoch": 0.0561870437113021, + "grad_norm": 10.910393714904785, + "learning_rate": 9.61473622157086e-06, + "loss": 1.9518613815307617, + "refine_loss": 0.0, + "step": 152 + }, + { + "ce_loss": 0.48050570487976074, + "epoch": 0.05655669531466593, + "grad_norm": 7.752503395080566, + "learning_rate": 9.608478616084784e-06, + "loss": 1.922022819519043, + "refine_loss": 0.0, + "step": 153 + }, + { + "ce_loss": 0.24023199081420898, + "epoch": 0.05692634691802976, + "grad_norm": 8.373605728149414, + "learning_rate": 9.602172669838721e-06, + "loss": 0.9609279632568359, + "refine_loss": 0.0, + "step": 154 + }, + { + "ce_loss": 0.522698163986206, + "epoch": 0.05729599852139359, + "grad_norm": 11.199427604675293, + "learning_rate": 9.595818448979061e-06, + "loss": 2.090792655944824, + "refine_loss": 0.0, + "step": 155 + }, + { + "ce_loss": 0.3466529846191406, + "epoch": 0.05766565012475742, + "grad_norm": 13.103911399841309, + "learning_rate": 9.589416020158577e-06, + "loss": 1.3866119384765625, + "refine_loss": 0.0, + "step": 156 + }, + { + "ce_loss": 0.5445573329925537, + "epoch": 0.05803530172812125, + "grad_norm": 10.399889945983887, + "learning_rate": 9.582965450535716e-06, + "loss": 2.178229331970215, + "refine_loss": 0.0, + "step": 157 + }, + { + "ce_loss": 0.48961734771728516, + "epoch": 0.05840495333148508, + "grad_norm": 11.567157745361328, + "learning_rate": 9.5764668077739e-06, + "loss": 1.9584693908691406, + "refine_loss": 0.0, + "step": 158 + }, + { + "ce_loss": 0.37181878089904785, + "epoch": 0.05877460493484891, + "grad_norm": 9.249194145202637, + "learning_rate": 9.569920160040815e-06, + "loss": 1.4872751235961914, + "refine_loss": 0.0, + "step": 159 + }, + { + "ce_loss": 0.3462662696838379, + "epoch": 0.05914425653821274, + "grad_norm": 7.767069339752197, + "learning_rate": 9.563325576007702e-06, + "loss": 1.3850650787353516, + "refine_loss": 0.0, + "step": 160 + }, + { + "ce_loss": 0.44601118564605713, + "epoch": 0.05951390814157657, + "grad_norm": 10.365139961242676, + "learning_rate": 9.556683124848624e-06, + "loss": 1.7840447425842285, + "refine_loss": 0.0, + "step": 161 + }, + { + "ce_loss": 0.3050652742385864, + "epoch": 0.0598835597449404, + "grad_norm": 11.97591495513916, + "learning_rate": 9.549992876239753e-06, + "loss": 1.2202610969543457, + "refine_loss": 0.0, + "step": 162 + }, + { + "ce_loss": 0.4575512409210205, + "epoch": 0.06025321134830423, + "grad_norm": 44.516170501708984, + "learning_rate": 9.54325490035863e-06, + "loss": 1.830204963684082, + "refine_loss": 0.0, + "step": 163 + }, + { + "ce_loss": 0.4702277183532715, + "epoch": 0.06062286295166805, + "grad_norm": 9.954727172851562, + "learning_rate": 9.536469267883432e-06, + "loss": 1.880910873413086, + "refine_loss": 0.0, + "step": 164 + }, + { + "ce_loss": 0.5686862468719482, + "epoch": 0.06099251455503188, + "grad_norm": 10.590654373168945, + "learning_rate": 9.529636049992235e-06, + "loss": 2.274744987487793, + "refine_loss": 0.0, + "step": 165 + }, + { + "ce_loss": 0.568974494934082, + "epoch": 0.06136216615839571, + "grad_norm": 9.985755920410156, + "learning_rate": 9.52275531836226e-06, + "loss": 2.275897979736328, + "refine_loss": 0.0, + "step": 166 + }, + { + "ce_loss": 0.5471830368041992, + "epoch": 0.06173181776175954, + "grad_norm": 9.16679859161377, + "learning_rate": 9.515827145169128e-06, + "loss": 2.188732147216797, + "refine_loss": 0.0, + "step": 167 + }, + { + "ce_loss": 0.39590275287628174, + "epoch": 0.06210146936512337, + "grad_norm": 8.725578308105469, + "learning_rate": 9.508851603086094e-06, + "loss": 1.583611011505127, + "refine_loss": 0.0, + "step": 168 + }, + { + "ce_loss": 0.5505349636077881, + "epoch": 0.0624711209684872, + "grad_norm": 9.873892784118652, + "learning_rate": 9.501828765283295e-06, + "loss": 2.2021398544311523, + "refine_loss": 0.0, + "step": 169 + }, + { + "ce_loss": 0.5248575210571289, + "epoch": 0.06284077257185103, + "grad_norm": 8.752303123474121, + "learning_rate": 9.494758705426978e-06, + "loss": 2.0994300842285156, + "refine_loss": 0.0, + "step": 170 + }, + { + "ce_loss": 0.5002797245979309, + "epoch": 0.06321042417521486, + "grad_norm": 7.338254928588867, + "learning_rate": 9.487641497678724e-06, + "loss": 2.0011188983917236, + "refine_loss": 0.0, + "step": 171 + }, + { + "ce_loss": 0.4642770290374756, + "epoch": 0.06358007577857869, + "grad_norm": 7.788568019866943, + "learning_rate": 9.480477216694674e-06, + "loss": 1.8571081161499023, + "refine_loss": 0.0, + "step": 172 + }, + { + "ce_loss": 0.4884679317474365, + "epoch": 0.06394972738194252, + "grad_norm": 8.496161460876465, + "learning_rate": 9.473265937624748e-06, + "loss": 1.953871726989746, + "refine_loss": 0.0, + "step": 173 + }, + { + "ce_loss": 0.5106401443481445, + "epoch": 0.06431937898530635, + "grad_norm": 8.824239730834961, + "learning_rate": 9.466007736111846e-06, + "loss": 2.042560577392578, + "refine_loss": 0.0, + "step": 174 + }, + { + "ce_loss": 0.5915066003799438, + "epoch": 0.06468903058867018, + "grad_norm": 13.448558807373047, + "learning_rate": 9.458702688291072e-06, + "loss": 2.3660264015197754, + "refine_loss": 0.0, + "step": 175 + }, + { + "ce_loss": 0.40168774127960205, + "epoch": 0.065058682192034, + "grad_norm": 7.067381381988525, + "learning_rate": 9.451350870788922e-06, + "loss": 1.6067509651184082, + "refine_loss": 0.0, + "step": 176 + }, + { + "ce_loss": 0.5602824687957764, + "epoch": 0.06542833379539784, + "grad_norm": 9.076802253723145, + "learning_rate": 9.443952360722477e-06, + "loss": 2.2411298751831055, + "refine_loss": 0.0, + "step": 177 + }, + { + "ce_loss": 0.45456504821777344, + "epoch": 0.06579798539876167, + "grad_norm": 9.775382995605469, + "learning_rate": 9.436507235698613e-06, + "loss": 1.8182601928710938, + "refine_loss": 0.0, + "step": 178 + }, + { + "ce_loss": 0.4774221181869507, + "epoch": 0.0661676370021255, + "grad_norm": 9.03006649017334, + "learning_rate": 9.429015573813163e-06, + "loss": 1.9096884727478027, + "refine_loss": 0.0, + "step": 179 + }, + { + "ce_loss": 0.4619652032852173, + "epoch": 0.06653728860548933, + "grad_norm": 9.270119667053223, + "learning_rate": 9.421477453650118e-06, + "loss": 1.8478608131408691, + "refine_loss": 0.0, + "step": 180 + }, + { + "ce_loss": 0.5257446765899658, + "epoch": 0.06690694020885316, + "grad_norm": 9.222434997558594, + "learning_rate": 9.413892954280793e-06, + "loss": 2.1029787063598633, + "refine_loss": 0.0, + "step": 181 + }, + { + "ce_loss": 0.4100075960159302, + "epoch": 0.06727659181221698, + "grad_norm": 7.979343891143799, + "learning_rate": 9.406262155262995e-06, + "loss": 1.6400303840637207, + "refine_loss": 0.0, + "step": 182 + }, + { + "ce_loss": 0.4897352457046509, + "epoch": 0.06764624341558081, + "grad_norm": 9.652615547180176, + "learning_rate": 9.398585136640195e-06, + "loss": 1.9589409828186035, + "refine_loss": 0.0, + "step": 183 + }, + { + "ce_loss": 0.46136927604675293, + "epoch": 0.06801589501894464, + "grad_norm": 28.561424255371094, + "learning_rate": 9.390861978940687e-06, + "loss": 1.8454771041870117, + "refine_loss": 0.0, + "step": 184 + }, + { + "ce_loss": 0.41747748851776123, + "epoch": 0.06838554662230847, + "grad_norm": 9.901796340942383, + "learning_rate": 9.38309276317674e-06, + "loss": 1.669909954071045, + "refine_loss": 0.0, + "step": 185 + }, + { + "ce_loss": 0.6189202666282654, + "epoch": 0.0687551982256723, + "grad_norm": 10.916679382324219, + "learning_rate": 9.37527757084375e-06, + "loss": 2.4756810665130615, + "refine_loss": 0.0, + "step": 186 + }, + { + "ce_loss": 0.44106316566467285, + "epoch": 0.06912484982903613, + "grad_norm": 7.622555255889893, + "learning_rate": 9.367416483919387e-06, + "loss": 1.7642526626586914, + "refine_loss": 0.0, + "step": 187 + }, + { + "ce_loss": 0.39499926567077637, + "epoch": 0.06949450143239996, + "grad_norm": 9.216776847839355, + "learning_rate": 9.359509584862735e-06, + "loss": 1.5799970626831055, + "refine_loss": 0.0, + "step": 188 + }, + { + "ce_loss": 0.4929466247558594, + "epoch": 0.06986415303576379, + "grad_norm": 10.007050514221191, + "learning_rate": 9.351556956613423e-06, + "loss": 1.9717864990234375, + "refine_loss": 0.0, + "step": 189 + }, + { + "ce_loss": 0.45609354972839355, + "epoch": 0.07023380463912762, + "grad_norm": 8.76170825958252, + "learning_rate": 9.343558682590757e-06, + "loss": 1.8243741989135742, + "refine_loss": 0.0, + "step": 190 + }, + { + "ce_loss": 0.5057827234268188, + "epoch": 0.07060345624249145, + "grad_norm": 13.169867515563965, + "learning_rate": 9.335514846692846e-06, + "loss": 2.0231308937072754, + "refine_loss": 0.0, + "step": 191 + }, + { + "ce_loss": 0.6233092546463013, + "epoch": 0.07097310784585528, + "grad_norm": 10.759102821350098, + "learning_rate": 9.327425533295725e-06, + "loss": 2.493237018585205, + "refine_loss": 0.0, + "step": 192 + }, + { + "ce_loss": 0.6317358613014221, + "epoch": 0.07134275944921911, + "grad_norm": 9.805523872375488, + "learning_rate": 9.31929082725246e-06, + "loss": 2.5269434452056885, + "refine_loss": 0.0, + "step": 193 + }, + { + "ce_loss": 0.5942422151565552, + "epoch": 0.07171241105258294, + "grad_norm": 12.808365821838379, + "learning_rate": 9.31111081389227e-06, + "loss": 2.3769688606262207, + "refine_loss": 0.0, + "step": 194 + }, + { + "ce_loss": 0.3131611943244934, + "epoch": 0.07208206265594677, + "grad_norm": 7.701195240020752, + "learning_rate": 9.302885579019626e-06, + "loss": 1.2526447772979736, + "refine_loss": 0.0, + "step": 195 + }, + { + "ce_loss": 0.3842979073524475, + "epoch": 0.0724517142593106, + "grad_norm": 8.427416801452637, + "learning_rate": 9.29461520891335e-06, + "loss": 1.53719162940979, + "refine_loss": 0.0, + "step": 196 + }, + { + "ce_loss": 0.559761643409729, + "epoch": 0.07282136586267443, + "grad_norm": 10.913392066955566, + "learning_rate": 9.286299790325708e-06, + "loss": 2.239046573638916, + "refine_loss": 0.0, + "step": 197 + }, + { + "ce_loss": 0.4311313033103943, + "epoch": 0.07319101746603826, + "grad_norm": 9.613964080810547, + "learning_rate": 9.277939410481507e-06, + "loss": 1.7245252132415771, + "refine_loss": 0.0, + "step": 198 + }, + { + "ce_loss": 0.5471141338348389, + "epoch": 0.07356066906940209, + "grad_norm": 9.670456886291504, + "learning_rate": 9.269534157077177e-06, + "loss": 2.1884565353393555, + "refine_loss": 0.0, + "step": 199 + }, + { + "ce_loss": 0.4022981524467468, + "epoch": 0.07393032067276592, + "grad_norm": 10.98754596710205, + "learning_rate": 9.261084118279846e-06, + "loss": 1.6091926097869873, + "refine_loss": 0.0, + "step": 200 + }, + { + "ce_loss": 0.4496643543243408, + "epoch": 0.07429997227612975, + "grad_norm": 9.274353981018066, + "learning_rate": 9.252589382726426e-06, + "loss": 1.7986574172973633, + "refine_loss": 0.0, + "step": 201 + }, + { + "ce_loss": 0.488103985786438, + "epoch": 0.07466962387949358, + "grad_norm": 9.705399513244629, + "learning_rate": 9.244050039522673e-06, + "loss": 1.952415943145752, + "refine_loss": 0.0, + "step": 202 + }, + { + "ce_loss": 0.4514484405517578, + "epoch": 0.07503927548285741, + "grad_norm": 10.720966339111328, + "learning_rate": 9.235466178242255e-06, + "loss": 1.8057937622070312, + "refine_loss": 0.0, + "step": 203 + }, + { + "ce_loss": 0.480252742767334, + "epoch": 0.07540892708622124, + "grad_norm": 9.947574615478516, + "learning_rate": 9.226837888925813e-06, + "loss": 1.921010971069336, + "refine_loss": 0.0, + "step": 204 + }, + { + "ce_loss": 0.3486366271972656, + "epoch": 0.07577857868958507, + "grad_norm": 8.975624084472656, + "learning_rate": 9.218165262080024e-06, + "loss": 1.3945465087890625, + "refine_loss": 0.0, + "step": 205 + }, + { + "ce_loss": 0.46786606311798096, + "epoch": 0.0761482302929489, + "grad_norm": 8.4287748336792, + "learning_rate": 9.209448388676636e-06, + "loss": 1.8714642524719238, + "refine_loss": 0.0, + "step": 206 + }, + { + "ce_loss": 0.530083179473877, + "epoch": 0.07651788189631273, + "grad_norm": 11.402825355529785, + "learning_rate": 9.200687360151527e-06, + "loss": 2.120332717895508, + "refine_loss": 0.0, + "step": 207 + }, + { + "ce_loss": 0.5058252811431885, + "epoch": 0.07688753349967656, + "grad_norm": 11.442806243896484, + "learning_rate": 9.191882268403743e-06, + "loss": 2.023301124572754, + "refine_loss": 0.0, + "step": 208 + }, + { + "ce_loss": 0.5403770208358765, + "epoch": 0.07725718510304039, + "grad_norm": 10.051042556762695, + "learning_rate": 9.183033205794525e-06, + "loss": 2.161508083343506, + "refine_loss": 0.0, + "step": 209 + }, + { + "ce_loss": 0.46942347288131714, + "epoch": 0.07762683670640422, + "grad_norm": 9.43576431274414, + "learning_rate": 9.174140265146356e-06, + "loss": 1.8776938915252686, + "refine_loss": 0.0, + "step": 210 + }, + { + "ce_loss": 0.3604688048362732, + "epoch": 0.07799648830976805, + "grad_norm": 9.28825569152832, + "learning_rate": 9.165203539741976e-06, + "loss": 1.4418752193450928, + "refine_loss": 0.0, + "step": 211 + }, + { + "ce_loss": 0.43255650997161865, + "epoch": 0.07836613991313188, + "grad_norm": 8.690860748291016, + "learning_rate": 9.156223123323405e-06, + "loss": 1.7302260398864746, + "refine_loss": 0.0, + "step": 212 + }, + { + "ce_loss": 0.4279766082763672, + "epoch": 0.0787357915164957, + "grad_norm": 10.115778923034668, + "learning_rate": 9.14719911009096e-06, + "loss": 1.7119064331054688, + "refine_loss": 0.0, + "step": 213 + }, + { + "ce_loss": 0.48570334911346436, + "epoch": 0.07910544311985954, + "grad_norm": 10.525174140930176, + "learning_rate": 9.13813159470227e-06, + "loss": 1.9428133964538574, + "refine_loss": 0.0, + "step": 214 + }, + { + "ce_loss": 0.5443117618560791, + "epoch": 0.07947509472322337, + "grad_norm": 11.55739974975586, + "learning_rate": 9.129020672271283e-06, + "loss": 2.1772470474243164, + "refine_loss": 0.0, + "step": 215 + }, + { + "ce_loss": 0.5341794490814209, + "epoch": 0.0798447463265872, + "grad_norm": 12.412625312805176, + "learning_rate": 9.119866438367263e-06, + "loss": 2.1367177963256836, + "refine_loss": 0.0, + "step": 216 + }, + { + "ce_loss": 0.426436185836792, + "epoch": 0.08021439792995103, + "grad_norm": 9.492140769958496, + "learning_rate": 9.11066898901379e-06, + "loss": 1.705744743347168, + "refine_loss": 0.0, + "step": 217 + }, + { + "ce_loss": 0.33316802978515625, + "epoch": 0.08058404953331486, + "grad_norm": 8.326518058776855, + "learning_rate": 9.101428420687759e-06, + "loss": 1.332672119140625, + "refine_loss": 0.0, + "step": 218 + }, + { + "ce_loss": 0.3220123052597046, + "epoch": 0.08095370113667869, + "grad_norm": 6.819008827209473, + "learning_rate": 9.092144830318357e-06, + "loss": 1.2880492210388184, + "refine_loss": 0.0, + "step": 219 + }, + { + "ce_loss": 0.28512728214263916, + "epoch": 0.08132335274004251, + "grad_norm": 8.26205825805664, + "learning_rate": 9.082818315286054e-06, + "loss": 1.1405091285705566, + "refine_loss": 0.0, + "step": 220 + }, + { + "ce_loss": 0.40984106063842773, + "epoch": 0.08169300434340634, + "grad_norm": 9.89958667755127, + "learning_rate": 9.073448973421581e-06, + "loss": 1.639364242553711, + "refine_loss": 0.0, + "step": 221 + }, + { + "ce_loss": 0.5046243667602539, + "epoch": 0.08206265594677017, + "grad_norm": 11.437060356140137, + "learning_rate": 9.0640369030049e-06, + "loss": 2.0184974670410156, + "refine_loss": 0.0, + "step": 222 + }, + { + "ce_loss": 0.40322911739349365, + "epoch": 0.082432307550134, + "grad_norm": 8.743781089782715, + "learning_rate": 9.054582202764175e-06, + "loss": 1.6129164695739746, + "refine_loss": 0.0, + "step": 223 + }, + { + "ce_loss": 0.22132658958435059, + "epoch": 0.08280195915349783, + "grad_norm": 6.564306735992432, + "learning_rate": 9.045084971874738e-06, + "loss": 0.8853063583374023, + "refine_loss": 0.0, + "step": 224 + }, + { + "ce_loss": 0.5105712413787842, + "epoch": 0.08317161075686166, + "grad_norm": 9.30221939086914, + "learning_rate": 9.035545309958048e-06, + "loss": 2.0422849655151367, + "refine_loss": 0.0, + "step": 225 + }, + { + "ce_loss": 0.46421217918395996, + "epoch": 0.0835412623602255, + "grad_norm": 9.150330543518066, + "learning_rate": 9.025963317080641e-06, + "loss": 1.8568487167358398, + "refine_loss": 0.0, + "step": 226 + }, + { + "ce_loss": 0.46818113327026367, + "epoch": 0.08391091396358932, + "grad_norm": 12.289566993713379, + "learning_rate": 9.016339093753093e-06, + "loss": 1.8727245330810547, + "refine_loss": 0.0, + "step": 227 + }, + { + "ce_loss": 0.39228200912475586, + "epoch": 0.08428056556695315, + "grad_norm": 8.975275993347168, + "learning_rate": 9.006672740928952e-06, + "loss": 1.5691280364990234, + "refine_loss": 0.0, + "step": 228 + }, + { + "ce_loss": 0.4591090679168701, + "epoch": 0.08465021717031698, + "grad_norm": 12.153667449951172, + "learning_rate": 8.99696436000368e-06, + "loss": 1.8364362716674805, + "refine_loss": 0.0, + "step": 229 + }, + { + "ce_loss": 0.38733434677124023, + "epoch": 0.08501986877368081, + "grad_norm": 10.18800163269043, + "learning_rate": 8.987214052813605e-06, + "loss": 1.549337387084961, + "refine_loss": 0.0, + "step": 230 + }, + { + "ce_loss": 0.4616786241531372, + "epoch": 0.08538952037704464, + "grad_norm": 8.207650184631348, + "learning_rate": 8.977421921634833e-06, + "loss": 1.8467144966125488, + "refine_loss": 0.0, + "step": 231 + }, + { + "ce_loss": 0.5613800287246704, + "epoch": 0.08575917198040847, + "grad_norm": 11.400620460510254, + "learning_rate": 8.967588069182184e-06, + "loss": 2.2455201148986816, + "refine_loss": 0.0, + "step": 232 + }, + { + "ce_loss": 0.3693675994873047, + "epoch": 0.0861288235837723, + "grad_norm": 7.9220380783081055, + "learning_rate": 8.957712598608123e-06, + "loss": 1.4774703979492188, + "refine_loss": 0.0, + "step": 233 + }, + { + "ce_loss": 0.47067689895629883, + "epoch": 0.08649847518713612, + "grad_norm": 16.614206314086914, + "learning_rate": 8.947795613501658e-06, + "loss": 1.8827075958251953, + "refine_loss": 0.0, + "step": 234 + }, + { + "ce_loss": 0.2858662009239197, + "epoch": 0.08686812679049995, + "grad_norm": 8.299520492553711, + "learning_rate": 8.937837217887273e-06, + "loss": 1.1434648036956787, + "refine_loss": 0.0, + "step": 235 + }, + { + "ce_loss": 0.4617964029312134, + "epoch": 0.08723777839386378, + "grad_norm": 8.04849910736084, + "learning_rate": 8.927837516223824e-06, + "loss": 1.8471856117248535, + "refine_loss": 0.0, + "step": 236 + }, + { + "ce_loss": 0.44788455963134766, + "epoch": 0.0876074299972276, + "grad_norm": 9.371988296508789, + "learning_rate": 8.917796613403451e-06, + "loss": 1.7915382385253906, + "refine_loss": 0.0, + "step": 237 + }, + { + "ce_loss": 0.4478434920310974, + "epoch": 0.08797708160059144, + "grad_norm": 9.542847633361816, + "learning_rate": 8.907714614750473e-06, + "loss": 1.7913739681243896, + "refine_loss": 0.0, + "step": 238 + }, + { + "ce_loss": 0.5488250255584717, + "epoch": 0.08834673320395527, + "grad_norm": 9.448659896850586, + "learning_rate": 8.897591626020284e-06, + "loss": 2.1953001022338867, + "refine_loss": 0.0, + "step": 239 + }, + { + "ce_loss": 0.3678814172744751, + "epoch": 0.0887163848073191, + "grad_norm": 8.709835052490234, + "learning_rate": 8.887427753398249e-06, + "loss": 1.4715256690979004, + "refine_loss": 0.0, + "step": 240 + }, + { + "ce_loss": 0.42265528440475464, + "epoch": 0.08908603641068293, + "grad_norm": 7.642331123352051, + "learning_rate": 8.877223103498576e-06, + "loss": 1.6906211376190186, + "refine_loss": 0.0, + "step": 241 + }, + { + "ce_loss": 0.5628585815429688, + "epoch": 0.08945568801404676, + "grad_norm": 9.470221519470215, + "learning_rate": 8.866977783363219e-06, + "loss": 2.251434326171875, + "refine_loss": 0.0, + "step": 242 + }, + { + "ce_loss": 0.7401205897331238, + "epoch": 0.08982533961741059, + "grad_norm": 12.376501083374023, + "learning_rate": 8.85669190046074e-06, + "loss": 2.960482358932495, + "refine_loss": 0.0, + "step": 243 + }, + { + "ce_loss": 0.4706430435180664, + "epoch": 0.09019499122077441, + "grad_norm": 9.771367073059082, + "learning_rate": 8.846365562685178e-06, + "loss": 1.8825721740722656, + "refine_loss": 0.0, + "step": 244 + }, + { + "ce_loss": 0.35656917095184326, + "epoch": 0.09056464282413824, + "grad_norm": 7.289247035980225, + "learning_rate": 8.83599887835493e-06, + "loss": 1.426276683807373, + "refine_loss": 0.0, + "step": 245 + }, + { + "ce_loss": 0.5782537460327148, + "epoch": 0.09093429442750207, + "grad_norm": 11.25285816192627, + "learning_rate": 8.825591956211614e-06, + "loss": 2.3130149841308594, + "refine_loss": 0.0, + "step": 246 + }, + { + "ce_loss": 0.4734983444213867, + "epoch": 0.0913039460308659, + "grad_norm": 9.553759574890137, + "learning_rate": 8.815144905418918e-06, + "loss": 1.8939933776855469, + "refine_loss": 0.0, + "step": 247 + }, + { + "ce_loss": 0.46240925788879395, + "epoch": 0.09167359763422973, + "grad_norm": 9.479375839233398, + "learning_rate": 8.804657835561456e-06, + "loss": 1.8496370315551758, + "refine_loss": 0.0, + "step": 248 + }, + { + "ce_loss": 0.40285658836364746, + "epoch": 0.09204324923759356, + "grad_norm": 9.462051391601562, + "learning_rate": 8.794130856643635e-06, + "loss": 1.6114263534545898, + "refine_loss": 0.0, + "step": 249 + }, + { + "ce_loss": 0.38945889472961426, + "epoch": 0.0924129008409574, + "grad_norm": 9.822796821594238, + "learning_rate": 8.783564079088478e-06, + "loss": 1.557835578918457, + "refine_loss": 0.0, + "step": 250 + }, + { + "ce_loss": 0.3760809898376465, + "epoch": 0.09278255244432122, + "grad_norm": 8.81179141998291, + "learning_rate": 8.772957613736483e-06, + "loss": 1.504323959350586, + "refine_loss": 0.0, + "step": 251 + }, + { + "ce_loss": 0.560760498046875, + "epoch": 0.09315220404768505, + "grad_norm": 20.687103271484375, + "learning_rate": 8.762311571844453e-06, + "loss": 2.2430419921875, + "refine_loss": 0.0, + "step": 252 + }, + { + "ce_loss": 0.5355932712554932, + "epoch": 0.09352185565104888, + "grad_norm": 9.03292465209961, + "learning_rate": 8.751626065084328e-06, + "loss": 2.1423730850219727, + "refine_loss": 0.0, + "step": 253 + }, + { + "ce_loss": 0.3351249694824219, + "epoch": 0.09389150725441271, + "grad_norm": 9.206958770751953, + "learning_rate": 8.74090120554202e-06, + "loss": 1.3404998779296875, + "refine_loss": 0.0, + "step": 254 + }, + { + "ce_loss": 0.48261046409606934, + "epoch": 0.09426115885777654, + "grad_norm": 11.454245567321777, + "learning_rate": 8.730137105716231e-06, + "loss": 1.9304418563842773, + "refine_loss": 0.0, + "step": 255 + }, + { + "ce_loss": 0.5166399478912354, + "epoch": 0.09463081046114037, + "grad_norm": 11.257972717285156, + "learning_rate": 8.719333878517274e-06, + "loss": 2.0665597915649414, + "refine_loss": 0.0, + "step": 256 + }, + { + "ce_loss": 0.32494664192199707, + "epoch": 0.0950004620645042, + "grad_norm": 7.650129795074463, + "learning_rate": 8.708491637265888e-06, + "loss": 1.2997865676879883, + "refine_loss": 0.0, + "step": 257 + }, + { + "ce_loss": 0.3243582248687744, + "epoch": 0.09537011366786803, + "grad_norm": 7.765240669250488, + "learning_rate": 8.697610495692055e-06, + "loss": 1.2974328994750977, + "refine_loss": 0.0, + "step": 258 + }, + { + "ce_loss": 0.4705219268798828, + "epoch": 0.09573976527123186, + "grad_norm": 10.238361358642578, + "learning_rate": 8.686690567933803e-06, + "loss": 1.8820877075195312, + "refine_loss": 0.0, + "step": 259 + }, + { + "ce_loss": 0.49955296516418457, + "epoch": 0.09610941687459569, + "grad_norm": 9.410140037536621, + "learning_rate": 8.675731968536004e-06, + "loss": 1.9982118606567383, + "refine_loss": 0.0, + "step": 260 + }, + { + "ce_loss": 0.6056475639343262, + "epoch": 0.09647906847795952, + "grad_norm": 10.697522163391113, + "learning_rate": 8.66473481244918e-06, + "loss": 2.4225902557373047, + "refine_loss": 0.0, + "step": 261 + }, + { + "ce_loss": 0.33145058155059814, + "epoch": 0.09684872008132335, + "grad_norm": 10.647120475769043, + "learning_rate": 8.653699215028298e-06, + "loss": 1.3258023262023926, + "refine_loss": 0.0, + "step": 262 + }, + { + "ce_loss": 0.5288515090942383, + "epoch": 0.09721837168468718, + "grad_norm": 13.26386833190918, + "learning_rate": 8.64262529203155e-06, + "loss": 2.115406036376953, + "refine_loss": 0.0, + "step": 263 + }, + { + "ce_loss": 0.4583320617675781, + "epoch": 0.09758802328805101, + "grad_norm": 8.062484741210938, + "learning_rate": 8.63151315961915e-06, + "loss": 1.8333282470703125, + "refine_loss": 0.0, + "step": 264 + }, + { + "ce_loss": 0.6104307174682617, + "epoch": 0.09795767489141484, + "grad_norm": 8.833436965942383, + "learning_rate": 8.620362934352109e-06, + "loss": 2.441722869873047, + "refine_loss": 0.0, + "step": 265 + }, + { + "ce_loss": 0.4519071578979492, + "epoch": 0.09832732649477867, + "grad_norm": 8.239848136901855, + "learning_rate": 8.609174733191012e-06, + "loss": 1.8076286315917969, + "refine_loss": 0.0, + "step": 266 + }, + { + "ce_loss": 0.4034709930419922, + "epoch": 0.0986969780981425, + "grad_norm": 7.666320323944092, + "learning_rate": 8.597948673494794e-06, + "loss": 1.6138839721679688, + "refine_loss": 0.0, + "step": 267 + }, + { + "ce_loss": 0.49150681495666504, + "epoch": 0.09906662970150633, + "grad_norm": 8.702020645141602, + "learning_rate": 8.586684873019513e-06, + "loss": 1.9660272598266602, + "refine_loss": 0.0, + "step": 268 + }, + { + "ce_loss": 0.5032563209533691, + "epoch": 0.09943628130487016, + "grad_norm": 8.703141212463379, + "learning_rate": 8.575383449917103e-06, + "loss": 2.0130252838134766, + "refine_loss": 0.0, + "step": 269 + }, + { + "ce_loss": 0.5780043601989746, + "epoch": 0.09980593290823399, + "grad_norm": 10.88441276550293, + "learning_rate": 8.564044522734147e-06, + "loss": 2.3120174407958984, + "refine_loss": 0.0, + "step": 270 + }, + { + "ce_loss": 0.5433449745178223, + "epoch": 0.10017558451159782, + "grad_norm": 8.907797813415527, + "learning_rate": 8.552668210410624e-06, + "loss": 2.173379898071289, + "refine_loss": 0.0, + "step": 271 + }, + { + "ce_loss": 0.3618725538253784, + "epoch": 0.10054523611496165, + "grad_norm": 7.457752704620361, + "learning_rate": 8.541254632278667e-06, + "loss": 1.4474902153015137, + "refine_loss": 0.0, + "step": 272 + }, + { + "ce_loss": 0.5376615524291992, + "epoch": 0.10091488771832548, + "grad_norm": 8.432844161987305, + "learning_rate": 8.52980390806131e-06, + "loss": 2.150646209716797, + "refine_loss": 0.0, + "step": 273 + }, + { + "ce_loss": 0.3972291946411133, + "epoch": 0.10128453932168931, + "grad_norm": 7.178942680358887, + "learning_rate": 8.518316157871232e-06, + "loss": 1.5889167785644531, + "refine_loss": 0.0, + "step": 274 + }, + { + "ce_loss": 0.4685020446777344, + "epoch": 0.10165419092505314, + "grad_norm": 9.789877891540527, + "learning_rate": 8.506791502209497e-06, + "loss": 1.8740081787109375, + "refine_loss": 0.0, + "step": 275 + }, + { + "ce_loss": 0.4952571392059326, + "epoch": 0.10202384252841697, + "grad_norm": 10.692804336547852, + "learning_rate": 8.495230061964289e-06, + "loss": 1.9810285568237305, + "refine_loss": 0.0, + "step": 276 + }, + { + "ce_loss": 0.3758821487426758, + "epoch": 0.1023934941317808, + "grad_norm": 11.113665580749512, + "learning_rate": 8.483631958409644e-06, + "loss": 1.5035285949707031, + "refine_loss": 0.0, + "step": 277 + }, + { + "ce_loss": 0.5127402544021606, + "epoch": 0.10276314573514463, + "grad_norm": 10.648669242858887, + "learning_rate": 8.471997313204183e-06, + "loss": 2.0509610176086426, + "refine_loss": 0.0, + "step": 278 + }, + { + "ce_loss": 0.3587445914745331, + "epoch": 0.10313279733850846, + "grad_norm": 8.7131986618042, + "learning_rate": 8.460326248389825e-06, + "loss": 1.4349783658981323, + "refine_loss": 0.0, + "step": 279 + }, + { + "ce_loss": 0.36674022674560547, + "epoch": 0.10350244894187229, + "grad_norm": 6.720057010650635, + "learning_rate": 8.448618886390523e-06, + "loss": 1.4669609069824219, + "refine_loss": 0.0, + "step": 280 + }, + { + "ce_loss": 0.4018174409866333, + "epoch": 0.10387210054523612, + "grad_norm": 6.930964469909668, + "learning_rate": 8.436875350010958e-06, + "loss": 1.6072697639465332, + "refine_loss": 0.0, + "step": 281 + }, + { + "ce_loss": 0.41391921043395996, + "epoch": 0.10424175214859994, + "grad_norm": 8.765676498413086, + "learning_rate": 8.425095762435274e-06, + "loss": 1.6556768417358398, + "refine_loss": 0.0, + "step": 282 + }, + { + "ce_loss": 0.5332019329071045, + "epoch": 0.10461140375196377, + "grad_norm": 8.878912925720215, + "learning_rate": 8.41328024722577e-06, + "loss": 2.132807731628418, + "refine_loss": 0.0, + "step": 283 + }, + { + "ce_loss": 0.44132518768310547, + "epoch": 0.1049810553553276, + "grad_norm": 8.45478343963623, + "learning_rate": 8.401428928321607e-06, + "loss": 1.7653007507324219, + "refine_loss": 0.0, + "step": 284 + }, + { + "ce_loss": 0.5140833854675293, + "epoch": 0.10535070695869143, + "grad_norm": 9.627408027648926, + "learning_rate": 8.389541930037516e-06, + "loss": 2.056333541870117, + "refine_loss": 0.0, + "step": 285 + }, + { + "ce_loss": 0.4408755302429199, + "epoch": 0.10572035856205526, + "grad_norm": 10.905773162841797, + "learning_rate": 8.377619377062483e-06, + "loss": 1.7635021209716797, + "refine_loss": 0.0, + "step": 286 + }, + { + "ce_loss": 0.4818708896636963, + "epoch": 0.1060900101654191, + "grad_norm": 8.665632247924805, + "learning_rate": 8.365661394458446e-06, + "loss": 1.9274835586547852, + "refine_loss": 0.0, + "step": 287 + }, + { + "ce_loss": 0.32633745670318604, + "epoch": 0.10645966176878292, + "grad_norm": 8.926676750183105, + "learning_rate": 8.353668107658984e-06, + "loss": 1.3053498268127441, + "refine_loss": 0.0, + "step": 288 + }, + { + "ce_loss": 0.4175689220428467, + "epoch": 0.10682931337214675, + "grad_norm": 7.890301704406738, + "learning_rate": 8.341639642468002e-06, + "loss": 1.6702756881713867, + "refine_loss": 0.0, + "step": 289 + }, + { + "ce_loss": 0.6209136843681335, + "epoch": 0.10719896497551058, + "grad_norm": 10.270397186279297, + "learning_rate": 8.329576125058406e-06, + "loss": 2.483654737472534, + "refine_loss": 0.0, + "step": 290 + }, + { + "ce_loss": 0.3677471876144409, + "epoch": 0.10756861657887441, + "grad_norm": 11.883509635925293, + "learning_rate": 8.317477681970786e-06, + "loss": 1.4709887504577637, + "refine_loss": 0.0, + "step": 291 + }, + { + "ce_loss": 0.5355865955352783, + "epoch": 0.10793826818223824, + "grad_norm": 16.64803695678711, + "learning_rate": 8.305344440112089e-06, + "loss": 2.1423463821411133, + "refine_loss": 0.0, + "step": 292 + }, + { + "ce_loss": 0.4296642541885376, + "epoch": 0.10830791978560207, + "grad_norm": 8.534472465515137, + "learning_rate": 8.293176526754274e-06, + "loss": 1.7186570167541504, + "refine_loss": 0.0, + "step": 293 + }, + { + "ce_loss": 0.359022319316864, + "epoch": 0.1086775713889659, + "grad_norm": 8.617088317871094, + "learning_rate": 8.280974069532999e-06, + "loss": 1.436089277267456, + "refine_loss": 0.0, + "step": 294 + }, + { + "ce_loss": 0.5393202304840088, + "epoch": 0.10904722299232973, + "grad_norm": 8.658536911010742, + "learning_rate": 8.268737196446264e-06, + "loss": 2.157280921936035, + "refine_loss": 0.0, + "step": 295 + }, + { + "ce_loss": 0.562441349029541, + "epoch": 0.10941687459569356, + "grad_norm": 10.44002914428711, + "learning_rate": 8.256466035853077e-06, + "loss": 2.249765396118164, + "refine_loss": 0.0, + "step": 296 + }, + { + "ce_loss": 0.4483368396759033, + "epoch": 0.10978652619905739, + "grad_norm": 8.190154075622559, + "learning_rate": 8.244160716472109e-06, + "loss": 1.7933473587036133, + "refine_loss": 0.0, + "step": 297 + }, + { + "ce_loss": 0.3947460651397705, + "epoch": 0.11015617780242122, + "grad_norm": 8.639655113220215, + "learning_rate": 8.231821367380335e-06, + "loss": 1.578984260559082, + "refine_loss": 0.0, + "step": 298 + }, + { + "ce_loss": 0.532772421836853, + "epoch": 0.11052582940578505, + "grad_norm": 10.298441886901855, + "learning_rate": 8.219448118011687e-06, + "loss": 2.131089687347412, + "refine_loss": 0.0, + "step": 299 + }, + { + "ce_loss": 0.3380366563796997, + "epoch": 0.11089548100914888, + "grad_norm": 8.979413032531738, + "learning_rate": 8.207041098155701e-06, + "loss": 1.3521466255187988, + "refine_loss": 0.0, + "step": 300 + }, + { + "ce_loss": 0.3165137767791748, + "epoch": 0.11126513261251271, + "grad_norm": 6.880032539367676, + "learning_rate": 8.19460043795614e-06, + "loss": 1.2660551071166992, + "refine_loss": 0.0, + "step": 301 + }, + { + "ce_loss": 0.37179386615753174, + "epoch": 0.11163478421587654, + "grad_norm": 8.511785507202148, + "learning_rate": 8.182126267909642e-06, + "loss": 1.487175464630127, + "refine_loss": 0.0, + "step": 302 + }, + { + "ce_loss": 0.5323119163513184, + "epoch": 0.11200443581924037, + "grad_norm": 8.984885215759277, + "learning_rate": 8.16961871886435e-06, + "loss": 2.1292476654052734, + "refine_loss": 0.0, + "step": 303 + }, + { + "ce_loss": 0.4188227653503418, + "epoch": 0.1123740874226042, + "grad_norm": 7.3782429695129395, + "learning_rate": 8.157077922018537e-06, + "loss": 1.6752910614013672, + "refine_loss": 0.0, + "step": 304 + }, + { + "ce_loss": 0.2845418453216553, + "epoch": 0.11274373902596803, + "grad_norm": 8.347393989562988, + "learning_rate": 8.144504008919224e-06, + "loss": 1.138167381286621, + "refine_loss": 0.0, + "step": 305 + }, + { + "ce_loss": 0.41903769969940186, + "epoch": 0.11311339062933186, + "grad_norm": 7.520823955535889, + "learning_rate": 8.13189711146081e-06, + "loss": 1.6761507987976074, + "refine_loss": 0.0, + "step": 306 + }, + { + "ce_loss": 0.43468451499938965, + "epoch": 0.11348304223269569, + "grad_norm": 12.214486122131348, + "learning_rate": 8.119257361883686e-06, + "loss": 1.7387380599975586, + "refine_loss": 0.0, + "step": 307 + }, + { + "ce_loss": 0.5317587852478027, + "epoch": 0.11385269383605952, + "grad_norm": 8.547225952148438, + "learning_rate": 8.106584892772844e-06, + "loss": 2.127035140991211, + "refine_loss": 0.0, + "step": 308 + }, + { + "ce_loss": 0.4470992088317871, + "epoch": 0.11422234543942335, + "grad_norm": 12.326611518859863, + "learning_rate": 8.093879837056486e-06, + "loss": 1.7883968353271484, + "refine_loss": 0.0, + "step": 309 + }, + { + "ce_loss": 0.2728137969970703, + "epoch": 0.11459199704278718, + "grad_norm": 7.99282693862915, + "learning_rate": 8.081142328004638e-06, + "loss": 1.0912551879882812, + "refine_loss": 0.0, + "step": 310 + }, + { + "ce_loss": 0.5302568674087524, + "epoch": 0.11496164864615101, + "grad_norm": 12.690000534057617, + "learning_rate": 8.068372499227738e-06, + "loss": 2.1210274696350098, + "refine_loss": 0.0, + "step": 311 + }, + { + "ce_loss": 0.5564645528793335, + "epoch": 0.11533130024951484, + "grad_norm": 16.117847442626953, + "learning_rate": 8.055570484675252e-06, + "loss": 2.225858211517334, + "refine_loss": 0.0, + "step": 312 + }, + { + "ce_loss": 0.4740626811981201, + "epoch": 0.11570095185287867, + "grad_norm": 8.95023250579834, + "learning_rate": 8.042736418634252e-06, + "loss": 1.8962507247924805, + "refine_loss": 0.0, + "step": 313 + }, + { + "ce_loss": 0.3715481758117676, + "epoch": 0.1160706034562425, + "grad_norm": 8.337223052978516, + "learning_rate": 8.029870435728018e-06, + "loss": 1.4861927032470703, + "refine_loss": 0.0, + "step": 314 + }, + { + "ce_loss": 0.47854557633399963, + "epoch": 0.11644025505960633, + "grad_norm": 9.77867317199707, + "learning_rate": 8.016972670914624e-06, + "loss": 1.9141823053359985, + "refine_loss": 0.0, + "step": 315 + }, + { + "ce_loss": 0.5768280029296875, + "epoch": 0.11680990666297016, + "grad_norm": 9.597472190856934, + "learning_rate": 8.004043259485519e-06, + "loss": 2.30731201171875, + "refine_loss": 0.0, + "step": 316 + }, + { + "ce_loss": 0.412142276763916, + "epoch": 0.11717955826633399, + "grad_norm": 8.377786636352539, + "learning_rate": 7.99108233706411e-06, + "loss": 1.648569107055664, + "refine_loss": 0.0, + "step": 317 + }, + { + "ce_loss": 0.27794015407562256, + "epoch": 0.11754920986969782, + "grad_norm": 12.236888885498047, + "learning_rate": 7.978090039604342e-06, + "loss": 1.1117606163024902, + "refine_loss": 0.0, + "step": 318 + }, + { + "ce_loss": 0.43860429525375366, + "epoch": 0.11791886147306165, + "grad_norm": 9.728561401367188, + "learning_rate": 7.965066503389264e-06, + "loss": 1.7544171810150146, + "refine_loss": 0.0, + "step": 319 + }, + { + "ce_loss": 0.5506472587585449, + "epoch": 0.11828851307642547, + "grad_norm": 10.485636711120605, + "learning_rate": 7.952011865029614e-06, + "loss": 2.2025890350341797, + "refine_loss": 0.0, + "step": 320 + }, + { + "ce_loss": 0.43887174129486084, + "epoch": 0.1186581646797893, + "grad_norm": 18.824281692504883, + "learning_rate": 7.938926261462366e-06, + "loss": 1.7554869651794434, + "refine_loss": 0.0, + "step": 321 + }, + { + "ce_loss": 0.374334454536438, + "epoch": 0.11902781628315313, + "grad_norm": 7.4034600257873535, + "learning_rate": 7.925809829949312e-06, + "loss": 1.497337818145752, + "refine_loss": 0.0, + "step": 322 + }, + { + "ce_loss": 0.49332761764526367, + "epoch": 0.11939746788651696, + "grad_norm": 10.19067668914795, + "learning_rate": 7.91266270807561e-06, + "loss": 1.9733104705810547, + "refine_loss": 0.0, + "step": 323 + }, + { + "ce_loss": 0.6006746292114258, + "epoch": 0.1197671194898808, + "grad_norm": 16.765464782714844, + "learning_rate": 7.89948503374835e-06, + "loss": 2.402698516845703, + "refine_loss": 0.0, + "step": 324 + }, + { + "ce_loss": 0.43160247802734375, + "epoch": 0.12013677109324462, + "grad_norm": 10.903677940368652, + "learning_rate": 7.886276945195098e-06, + "loss": 1.726409912109375, + "refine_loss": 0.0, + "step": 325 + }, + { + "ce_loss": 0.38602232933044434, + "epoch": 0.12050642269660845, + "grad_norm": 15.02309513092041, + "learning_rate": 7.873038580962453e-06, + "loss": 1.5440893173217773, + "refine_loss": 0.0, + "step": 326 + }, + { + "ce_loss": 0.3634631633758545, + "epoch": 0.12087607429997227, + "grad_norm": 8.714849472045898, + "learning_rate": 7.859770079914592e-06, + "loss": 1.453852653503418, + "refine_loss": 0.0, + "step": 327 + }, + { + "ce_loss": 0.4161427617073059, + "epoch": 0.1212457259033361, + "grad_norm": 8.797317504882812, + "learning_rate": 7.846471581231814e-06, + "loss": 1.6645710468292236, + "refine_loss": 0.0, + "step": 328 + }, + { + "ce_loss": 0.5999033451080322, + "epoch": 0.12161537750669993, + "grad_norm": 11.13209342956543, + "learning_rate": 7.833143224409076e-06, + "loss": 2.399613380432129, + "refine_loss": 0.0, + "step": 329 + }, + { + "ce_loss": 0.39738941192626953, + "epoch": 0.12198502911006376, + "grad_norm": 14.08590030670166, + "learning_rate": 7.819785149254534e-06, + "loss": 1.5895576477050781, + "refine_loss": 0.0, + "step": 330 + }, + { + "ce_loss": 0.33948755264282227, + "epoch": 0.12235468071342759, + "grad_norm": 8.664375305175781, + "learning_rate": 7.806397495888074e-06, + "loss": 1.357950210571289, + "refine_loss": 0.0, + "step": 331 + }, + { + "ce_loss": 0.3225889205932617, + "epoch": 0.12272433231679142, + "grad_norm": 11.298969268798828, + "learning_rate": 7.792980404739849e-06, + "loss": 1.2903556823730469, + "refine_loss": 0.0, + "step": 332 + }, + { + "ce_loss": 0.39954984188079834, + "epoch": 0.12309398392015525, + "grad_norm": 8.629815101623535, + "learning_rate": 7.779534016548791e-06, + "loss": 1.5981993675231934, + "refine_loss": 0.0, + "step": 333 + }, + { + "ce_loss": 0.38355565071105957, + "epoch": 0.12346363552351908, + "grad_norm": 6.805602073669434, + "learning_rate": 7.766058472361154e-06, + "loss": 1.5342226028442383, + "refine_loss": 0.0, + "step": 334 + }, + { + "ce_loss": 0.5514776706695557, + "epoch": 0.12383328712688291, + "grad_norm": 10.065786361694336, + "learning_rate": 7.752553913529019e-06, + "loss": 2.2059106826782227, + "refine_loss": 0.0, + "step": 335 + }, + { + "ce_loss": 0.4699883460998535, + "epoch": 0.12420293873024674, + "grad_norm": 7.466202735900879, + "learning_rate": 7.739020481708816e-06, + "loss": 1.879953384399414, + "refine_loss": 0.0, + "step": 336 + }, + { + "ce_loss": 0.6298911571502686, + "epoch": 0.12457259033361057, + "grad_norm": 14.050506591796875, + "learning_rate": 7.725458318859842e-06, + "loss": 2.519564628601074, + "refine_loss": 0.0, + "step": 337 + }, + { + "ce_loss": 0.5235673189163208, + "epoch": 0.1249422419369744, + "grad_norm": 11.27765941619873, + "learning_rate": 7.711867567242769e-06, + "loss": 2.094269275665283, + "refine_loss": 0.0, + "step": 338 + }, + { + "ce_loss": 0.3922382593154907, + "epoch": 0.12531189354033823, + "grad_norm": 8.213973045349121, + "learning_rate": 7.698248369418146e-06, + "loss": 1.568953037261963, + "refine_loss": 0.0, + "step": 339 + }, + { + "ce_loss": 0.44464361667633057, + "epoch": 0.12568154514370206, + "grad_norm": 7.8737359046936035, + "learning_rate": 7.68460086824492e-06, + "loss": 1.7785744667053223, + "refine_loss": 0.0, + "step": 340 + }, + { + "ce_loss": 0.46274518966674805, + "epoch": 0.12605119674706589, + "grad_norm": 9.076868057250977, + "learning_rate": 7.670925206878917e-06, + "loss": 1.8509807586669922, + "refine_loss": 0.0, + "step": 341 + }, + { + "ce_loss": 0.3430368900299072, + "epoch": 0.12642084835042972, + "grad_norm": 8.619489669799805, + "learning_rate": 7.657221528771352e-06, + "loss": 1.372147560119629, + "refine_loss": 0.0, + "step": 342 + }, + { + "ce_loss": 0.5602612495422363, + "epoch": 0.12679049995379355, + "grad_norm": 15.776405334472656, + "learning_rate": 7.643489977667327e-06, + "loss": 2.2410449981689453, + "refine_loss": 0.0, + "step": 343 + }, + { + "ce_loss": 0.44336050748825073, + "epoch": 0.12716015155715737, + "grad_norm": 9.020859718322754, + "learning_rate": 7.629730697604314e-06, + "loss": 1.773442029953003, + "refine_loss": 0.0, + "step": 344 + }, + { + "ce_loss": 0.5886321067810059, + "epoch": 0.1275298031605212, + "grad_norm": 8.447283744812012, + "learning_rate": 7.61594383291065e-06, + "loss": 2.3545284271240234, + "refine_loss": 0.0, + "step": 345 + }, + { + "ce_loss": 0.4461359977722168, + "epoch": 0.12789945476388503, + "grad_norm": 8.415456771850586, + "learning_rate": 7.602129528204023e-06, + "loss": 1.7845439910888672, + "refine_loss": 0.0, + "step": 346 + }, + { + "ce_loss": 0.29878830909729004, + "epoch": 0.12826910636724886, + "grad_norm": 7.092437267303467, + "learning_rate": 7.588287928389952e-06, + "loss": 1.1951532363891602, + "refine_loss": 0.0, + "step": 347 + }, + { + "ce_loss": 0.4129582643508911, + "epoch": 0.1286387579706127, + "grad_norm": 6.72657585144043, + "learning_rate": 7.574419178660269e-06, + "loss": 1.6518330574035645, + "refine_loss": 0.0, + "step": 348 + }, + { + "ce_loss": 0.48720407485961914, + "epoch": 0.12900840957397652, + "grad_norm": 10.361894607543945, + "learning_rate": 7.560523424491595e-06, + "loss": 1.9488162994384766, + "refine_loss": 0.0, + "step": 349 + }, + { + "ce_loss": 0.32165980339050293, + "epoch": 0.12937806117734035, + "grad_norm": 7.843476295471191, + "learning_rate": 7.546600811643816e-06, + "loss": 1.2866392135620117, + "refine_loss": 0.0, + "step": 350 + }, + { + "ce_loss": 0.44182366132736206, + "epoch": 0.12974771278070418, + "grad_norm": 8.648327827453613, + "learning_rate": 7.532651486158554e-06, + "loss": 1.7672946453094482, + "refine_loss": 0.0, + "step": 351 + }, + { + "ce_loss": 0.4402346611022949, + "epoch": 0.130117364384068, + "grad_norm": 9.440979957580566, + "learning_rate": 7.5186755943576324e-06, + "loss": 1.7609386444091797, + "refine_loss": 0.0, + "step": 352 + }, + { + "ce_loss": 0.45134496688842773, + "epoch": 0.13048701598743184, + "grad_norm": 9.14094066619873, + "learning_rate": 7.504673282841544e-06, + "loss": 1.805379867553711, + "refine_loss": 0.0, + "step": 353 + }, + { + "ce_loss": 0.3473091125488281, + "epoch": 0.13085666759079567, + "grad_norm": 9.138503074645996, + "learning_rate": 7.490644698487909e-06, + "loss": 1.3892364501953125, + "refine_loss": 0.0, + "step": 354 + }, + { + "ce_loss": 0.43794190883636475, + "epoch": 0.1312263191941595, + "grad_norm": 8.681024551391602, + "learning_rate": 7.476589988449939e-06, + "loss": 1.751767635345459, + "refine_loss": 0.0, + "step": 355 + }, + { + "ce_loss": 0.38267195224761963, + "epoch": 0.13159597079752333, + "grad_norm": 7.119711875915527, + "learning_rate": 7.462509300154892e-06, + "loss": 1.5306878089904785, + "refine_loss": 0.0, + "step": 356 + }, + { + "ce_loss": 0.4581352472305298, + "epoch": 0.13196562240088716, + "grad_norm": 9.239828109741211, + "learning_rate": 7.448402781302526e-06, + "loss": 1.8325409889221191, + "refine_loss": 0.0, + "step": 357 + }, + { + "ce_loss": 0.4802442193031311, + "epoch": 0.132335274004251, + "grad_norm": 9.309179306030273, + "learning_rate": 7.434270579863549e-06, + "loss": 1.9209768772125244, + "refine_loss": 0.0, + "step": 358 + }, + { + "ce_loss": 0.5408854484558105, + "epoch": 0.13270492560761482, + "grad_norm": 7.973039627075195, + "learning_rate": 7.420112844078066e-06, + "loss": 2.163541793823242, + "refine_loss": 0.0, + "step": 359 + }, + { + "ce_loss": 0.3056582808494568, + "epoch": 0.13307457721097865, + "grad_norm": 8.10049819946289, + "learning_rate": 7.405929722454026e-06, + "loss": 1.2226331233978271, + "refine_loss": 0.0, + "step": 360 + }, + { + "ce_loss": 0.499347448348999, + "epoch": 0.13344422881434248, + "grad_norm": 11.037912368774414, + "learning_rate": 7.391721363765664e-06, + "loss": 1.997389793395996, + "refine_loss": 0.0, + "step": 361 + }, + { + "ce_loss": 0.4362260103225708, + "epoch": 0.1338138804177063, + "grad_norm": 11.615614891052246, + "learning_rate": 7.3774879170519386e-06, + "loss": 1.7449040412902832, + "refine_loss": 0.0, + "step": 362 + }, + { + "ce_loss": 0.516998291015625, + "epoch": 0.13418353202107014, + "grad_norm": 10.43577766418457, + "learning_rate": 7.363229531614973e-06, + "loss": 2.0679931640625, + "refine_loss": 0.0, + "step": 363 + }, + { + "ce_loss": 0.48120182752609253, + "epoch": 0.13455318362443397, + "grad_norm": 8.659377098083496, + "learning_rate": 7.348946357018479e-06, + "loss": 1.9248073101043701, + "refine_loss": 0.0, + "step": 364 + }, + { + "ce_loss": 0.3751688599586487, + "epoch": 0.1349228352277978, + "grad_norm": 10.187156677246094, + "learning_rate": 7.334638543086203e-06, + "loss": 1.5006754398345947, + "refine_loss": 0.0, + "step": 365 + }, + { + "ce_loss": 0.4178575575351715, + "epoch": 0.13529248683116163, + "grad_norm": 11.816719055175781, + "learning_rate": 7.320306239900343e-06, + "loss": 1.671430230140686, + "refine_loss": 0.0, + "step": 366 + }, + { + "ce_loss": 0.5968947410583496, + "epoch": 0.13566213843452546, + "grad_norm": 10.521284103393555, + "learning_rate": 7.305949597799976e-06, + "loss": 2.3875789642333984, + "refine_loss": 0.0, + "step": 367 + }, + { + "ce_loss": 0.39086437225341797, + "epoch": 0.1360317900378893, + "grad_norm": 11.323673248291016, + "learning_rate": 7.291568767379484e-06, + "loss": 1.5634574890136719, + "refine_loss": 0.0, + "step": 368 + }, + { + "ce_loss": 0.5866038799285889, + "epoch": 0.13640144164125312, + "grad_norm": 9.214038848876953, + "learning_rate": 7.277163899486975e-06, + "loss": 2.3464155197143555, + "refine_loss": 0.0, + "step": 369 + }, + { + "ce_loss": 0.388097882270813, + "epoch": 0.13677109324461695, + "grad_norm": 11.912174224853516, + "learning_rate": 7.262735145222696e-06, + "loss": 1.552391529083252, + "refine_loss": 0.0, + "step": 370 + }, + { + "ce_loss": 0.4848365783691406, + "epoch": 0.13714074484798078, + "grad_norm": 9.618626594543457, + "learning_rate": 7.248282655937451e-06, + "loss": 1.9393463134765625, + "refine_loss": 0.0, + "step": 371 + }, + { + "ce_loss": 0.4850764274597168, + "epoch": 0.1375103964513446, + "grad_norm": 10.840434074401855, + "learning_rate": 7.233806583231012e-06, + "loss": 1.9403057098388672, + "refine_loss": 0.0, + "step": 372 + }, + { + "ce_loss": 0.6476688385009766, + "epoch": 0.13788004805470844, + "grad_norm": 10.49376392364502, + "learning_rate": 7.219307078950536e-06, + "loss": 2.5906753540039062, + "refine_loss": 0.0, + "step": 373 + }, + { + "ce_loss": 0.4587974548339844, + "epoch": 0.13824969965807227, + "grad_norm": 9.081794738769531, + "learning_rate": 7.204784295188959e-06, + "loss": 1.8351898193359375, + "refine_loss": 0.0, + "step": 374 + }, + { + "ce_loss": 0.3930548429489136, + "epoch": 0.1386193512614361, + "grad_norm": 8.18437385559082, + "learning_rate": 7.190238384283413e-06, + "loss": 1.5722193717956543, + "refine_loss": 0.0, + "step": 375 + }, + { + "ce_loss": 0.6110433340072632, + "epoch": 0.13898900286479993, + "grad_norm": 10.602723121643066, + "learning_rate": 7.1756694988136165e-06, + "loss": 2.4441733360290527, + "refine_loss": 0.0, + "step": 376 + }, + { + "ce_loss": 0.4912828207015991, + "epoch": 0.13935865446816376, + "grad_norm": 10.058487892150879, + "learning_rate": 7.161077791600288e-06, + "loss": 1.9651312828063965, + "refine_loss": 0.0, + "step": 377 + }, + { + "ce_loss": 0.2811049222946167, + "epoch": 0.13972830607152759, + "grad_norm": 6.920490741729736, + "learning_rate": 7.14646341570353e-06, + "loss": 1.1244196891784668, + "refine_loss": 0.0, + "step": 378 + }, + { + "ce_loss": 0.5073210000991821, + "epoch": 0.14009795767489142, + "grad_norm": 8.2274169921875, + "learning_rate": 7.1318265244212305e-06, + "loss": 2.0292840003967285, + "refine_loss": 0.0, + "step": 379 + }, + { + "ce_loss": 0.4377254843711853, + "epoch": 0.14046760927825525, + "grad_norm": 8.894360542297363, + "learning_rate": 7.117167271287453e-06, + "loss": 1.7509019374847412, + "refine_loss": 0.0, + "step": 380 + }, + { + "ce_loss": 0.41256898641586304, + "epoch": 0.14083726088161908, + "grad_norm": 11.11082649230957, + "learning_rate": 7.102485810070824e-06, + "loss": 1.6502759456634521, + "refine_loss": 0.0, + "step": 381 + }, + { + "ce_loss": 0.4415299892425537, + "epoch": 0.1412069124849829, + "grad_norm": 10.413506507873535, + "learning_rate": 7.0877822947729265e-06, + "loss": 1.7661199569702148, + "refine_loss": 0.0, + "step": 382 + }, + { + "ce_loss": 0.5274656414985657, + "epoch": 0.14157656408834673, + "grad_norm": 8.005457878112793, + "learning_rate": 7.073056879626681e-06, + "loss": 2.1098625659942627, + "refine_loss": 0.0, + "step": 383 + }, + { + "ce_loss": 0.4293609857559204, + "epoch": 0.14194621569171056, + "grad_norm": 26.477588653564453, + "learning_rate": 7.05830971909472e-06, + "loss": 1.7174439430236816, + "refine_loss": 0.0, + "step": 384 + }, + { + "ce_loss": 0.5263055562973022, + "epoch": 0.1423158672950744, + "grad_norm": 9.726752281188965, + "learning_rate": 7.043540967867782e-06, + "loss": 2.105222225189209, + "refine_loss": 0.0, + "step": 385 + }, + { + "ce_loss": 0.4691835045814514, + "epoch": 0.14268551889843822, + "grad_norm": 10.052578926086426, + "learning_rate": 7.028750780863078e-06, + "loss": 1.8767340183258057, + "refine_loss": 0.0, + "step": 386 + }, + { + "ce_loss": 0.4462701082229614, + "epoch": 0.14305517050180205, + "grad_norm": 12.45573616027832, + "learning_rate": 7.013939313222669e-06, + "loss": 1.7850804328918457, + "refine_loss": 0.0, + "step": 387 + }, + { + "ce_loss": 0.5242712497711182, + "epoch": 0.14342482210516588, + "grad_norm": 8.602133750915527, + "learning_rate": 6.999106720311846e-06, + "loss": 2.0970849990844727, + "refine_loss": 0.0, + "step": 388 + }, + { + "ce_loss": 0.4153413772583008, + "epoch": 0.1437944737085297, + "grad_norm": 9.957794189453125, + "learning_rate": 6.9842531577174865e-06, + "loss": 1.6613655090332031, + "refine_loss": 0.0, + "step": 389 + }, + { + "ce_loss": 0.512686014175415, + "epoch": 0.14416412531189354, + "grad_norm": 9.969054222106934, + "learning_rate": 6.969378781246436e-06, + "loss": 2.05074405670166, + "refine_loss": 0.0, + "step": 390 + }, + { + "ce_loss": 0.45514824986457825, + "epoch": 0.14453377691525737, + "grad_norm": 8.452573776245117, + "learning_rate": 6.954483746923865e-06, + "loss": 1.820592999458313, + "refine_loss": 0.0, + "step": 391 + }, + { + "ce_loss": 0.5376133918762207, + "epoch": 0.1449034285186212, + "grad_norm": 9.228282928466797, + "learning_rate": 6.939568210991633e-06, + "loss": 2.150453567504883, + "refine_loss": 0.0, + "step": 392 + }, + { + "ce_loss": 0.41036760807037354, + "epoch": 0.14527308012198503, + "grad_norm": 9.939075469970703, + "learning_rate": 6.924632329906657e-06, + "loss": 1.6414704322814941, + "refine_loss": 0.0, + "step": 393 + }, + { + "ce_loss": 0.47531676292419434, + "epoch": 0.14564273172534886, + "grad_norm": 9.014841079711914, + "learning_rate": 6.9096762603392595e-06, + "loss": 1.9012670516967773, + "refine_loss": 0.0, + "step": 394 + }, + { + "ce_loss": 0.43586719036102295, + "epoch": 0.1460123833287127, + "grad_norm": 7.974857807159424, + "learning_rate": 6.894700159171535e-06, + "loss": 1.7434687614440918, + "refine_loss": 0.0, + "step": 395 + }, + { + "ce_loss": 0.5890612602233887, + "epoch": 0.14638203493207652, + "grad_norm": 10.490070343017578, + "learning_rate": 6.8797041834956955e-06, + "loss": 2.3562450408935547, + "refine_loss": 0.0, + "step": 396 + }, + { + "ce_loss": 0.4850795269012451, + "epoch": 0.14675168653544035, + "grad_norm": 9.473137855529785, + "learning_rate": 6.8646884906124345e-06, + "loss": 1.9403181076049805, + "refine_loss": 0.0, + "step": 397 + }, + { + "ce_loss": 0.5043740272521973, + "epoch": 0.14712133813880418, + "grad_norm": 7.250847339630127, + "learning_rate": 6.849653238029261e-06, + "loss": 2.017496109008789, + "refine_loss": 0.0, + "step": 398 + }, + { + "ce_loss": 0.47040796279907227, + "epoch": 0.147490989742168, + "grad_norm": 8.538890838623047, + "learning_rate": 6.834598583458862e-06, + "loss": 1.881631851196289, + "refine_loss": 0.0, + "step": 399 + }, + { + "ce_loss": 0.45733821392059326, + "epoch": 0.14786064134553184, + "grad_norm": 8.203897476196289, + "learning_rate": 6.819524684817439e-06, + "loss": 1.829352855682373, + "refine_loss": 0.0, + "step": 400 + }, + { + "ce_loss": 0.34762871265411377, + "epoch": 0.14823029294889567, + "grad_norm": 6.719568252563477, + "learning_rate": 6.804431700223057e-06, + "loss": 1.390514850616455, + "refine_loss": 0.0, + "step": 401 + }, + { + "ce_loss": 0.39800161123275757, + "epoch": 0.1485999445522595, + "grad_norm": 7.814823150634766, + "learning_rate": 6.78931978799398e-06, + "loss": 1.5920064449310303, + "refine_loss": 0.0, + "step": 402 + }, + { + "ce_loss": 0.48913097381591797, + "epoch": 0.14896959615562333, + "grad_norm": 9.048940658569336, + "learning_rate": 6.774189106647021e-06, + "loss": 1.9565238952636719, + "refine_loss": 0.0, + "step": 403 + }, + { + "ce_loss": 0.571692705154419, + "epoch": 0.14933924775898716, + "grad_norm": 10.357328414916992, + "learning_rate": 6.7590398148958625e-06, + "loss": 2.286770820617676, + "refine_loss": 0.0, + "step": 404 + }, + { + "ce_loss": 0.5078752040863037, + "epoch": 0.149708899362351, + "grad_norm": 8.636435508728027, + "learning_rate": 6.743872071649411e-06, + "loss": 2.031500816345215, + "refine_loss": 0.0, + "step": 405 + }, + { + "ce_loss": 0.5608711838722229, + "epoch": 0.15007855096571482, + "grad_norm": 8.994526863098145, + "learning_rate": 6.728686036010115e-06, + "loss": 2.2434847354888916, + "refine_loss": 0.0, + "step": 406 + }, + { + "ce_loss": 0.5935931205749512, + "epoch": 0.15044820256907865, + "grad_norm": 9.671996116638184, + "learning_rate": 6.7134818672723005e-06, + "loss": 2.3743724822998047, + "refine_loss": 0.0, + "step": 407 + }, + { + "ce_loss": 0.4106858968734741, + "epoch": 0.15081785417244248, + "grad_norm": 9.027938842773438, + "learning_rate": 6.698259724920503e-06, + "loss": 1.6427435874938965, + "refine_loss": 0.0, + "step": 408 + }, + { + "ce_loss": 0.6180305480957031, + "epoch": 0.1511875057758063, + "grad_norm": 9.486806869506836, + "learning_rate": 6.6830197686277945e-06, + "loss": 2.4721221923828125, + "refine_loss": 0.0, + "step": 409 + }, + { + "ce_loss": 0.4542107582092285, + "epoch": 0.15155715737917014, + "grad_norm": 8.61961555480957, + "learning_rate": 6.667762158254104e-06, + "loss": 1.816843032836914, + "refine_loss": 0.0, + "step": 410 + }, + { + "ce_loss": 0.5169097185134888, + "epoch": 0.15192680898253397, + "grad_norm": 8.94442081451416, + "learning_rate": 6.652487053844544e-06, + "loss": 2.067638874053955, + "refine_loss": 0.0, + "step": 411 + }, + { + "ce_loss": 0.3925154209136963, + "epoch": 0.1522964605858978, + "grad_norm": 8.269384384155273, + "learning_rate": 6.637194615627733e-06, + "loss": 1.5700616836547852, + "refine_loss": 0.0, + "step": 412 + }, + { + "ce_loss": 0.4043484926223755, + "epoch": 0.15266611218926163, + "grad_norm": 7.826936721801758, + "learning_rate": 6.621885004014113e-06, + "loss": 1.617393970489502, + "refine_loss": 0.0, + "step": 413 + }, + { + "ce_loss": 0.5480577945709229, + "epoch": 0.15303576379262546, + "grad_norm": 10.158780097961426, + "learning_rate": 6.6065583795942625e-06, + "loss": 2.1922311782836914, + "refine_loss": 0.0, + "step": 414 + }, + { + "ce_loss": 0.5260552167892456, + "epoch": 0.1534054153959893, + "grad_norm": 8.689077377319336, + "learning_rate": 6.591214903137221e-06, + "loss": 2.1042208671569824, + "refine_loss": 0.0, + "step": 415 + }, + { + "ce_loss": 0.41028231382369995, + "epoch": 0.15377506699935312, + "grad_norm": 7.898672580718994, + "learning_rate": 6.5758547355887944e-06, + "loss": 1.6411292552947998, + "refine_loss": 0.0, + "step": 416 + }, + { + "ce_loss": 0.41179895401000977, + "epoch": 0.15414471860271695, + "grad_norm": 9.759132385253906, + "learning_rate": 6.560478038069873e-06, + "loss": 1.647195816040039, + "refine_loss": 0.0, + "step": 417 + }, + { + "ce_loss": 0.448836088180542, + "epoch": 0.15451437020608078, + "grad_norm": 12.88847827911377, + "learning_rate": 6.545084971874738e-06, + "loss": 1.795344352722168, + "refine_loss": 0.0, + "step": 418 + }, + { + "ce_loss": 0.46027958393096924, + "epoch": 0.1548840218094446, + "grad_norm": 8.673314094543457, + "learning_rate": 6.52967569846937e-06, + "loss": 1.841118335723877, + "refine_loss": 0.0, + "step": 419 + }, + { + "ce_loss": 0.4846668243408203, + "epoch": 0.15525367341280844, + "grad_norm": 14.146642684936523, + "learning_rate": 6.514250379489754e-06, + "loss": 1.9386672973632812, + "refine_loss": 0.0, + "step": 420 + }, + { + "ce_loss": 0.34477412700653076, + "epoch": 0.15562332501617226, + "grad_norm": 10.503873825073242, + "learning_rate": 6.49880917674019e-06, + "loss": 1.379096508026123, + "refine_loss": 0.0, + "step": 421 + }, + { + "ce_loss": 0.32955145835876465, + "epoch": 0.1559929766195361, + "grad_norm": 7.303184986114502, + "learning_rate": 6.483352252191585e-06, + "loss": 1.3182058334350586, + "refine_loss": 0.0, + "step": 422 + }, + { + "ce_loss": 0.5053341388702393, + "epoch": 0.15636262822289992, + "grad_norm": 7.319607734680176, + "learning_rate": 6.467879767979764e-06, + "loss": 2.021336555480957, + "refine_loss": 0.0, + "step": 423 + }, + { + "ce_loss": 0.4482458829879761, + "epoch": 0.15673227982626375, + "grad_norm": 8.976991653442383, + "learning_rate": 6.452391886403767e-06, + "loss": 1.7929835319519043, + "refine_loss": 0.0, + "step": 424 + }, + { + "ce_loss": 0.4032573103904724, + "epoch": 0.15710193142962758, + "grad_norm": 16.490955352783203, + "learning_rate": 6.436888769924142e-06, + "loss": 1.6130292415618896, + "refine_loss": 0.0, + "step": 425 + }, + { + "ce_loss": 0.48328423500061035, + "epoch": 0.1574715830329914, + "grad_norm": 8.864872932434082, + "learning_rate": 6.421370581161244e-06, + "loss": 1.9331369400024414, + "refine_loss": 0.0, + "step": 426 + }, + { + "ce_loss": 0.5267331600189209, + "epoch": 0.15784123463635524, + "grad_norm": 11.598426818847656, + "learning_rate": 6.405837482893529e-06, + "loss": 2.1069326400756836, + "refine_loss": 0.0, + "step": 427 + }, + { + "ce_loss": 0.3422888517379761, + "epoch": 0.15821088623971907, + "grad_norm": 8.846256256103516, + "learning_rate": 6.390289638055851e-06, + "loss": 1.3691554069519043, + "refine_loss": 0.0, + "step": 428 + }, + { + "ce_loss": 0.474323034286499, + "epoch": 0.1585805378430829, + "grad_norm": 9.152615547180176, + "learning_rate": 6.374727209737743e-06, + "loss": 1.897292137145996, + "refine_loss": 0.0, + "step": 429 + }, + { + "ce_loss": 0.414692759513855, + "epoch": 0.15895018944644673, + "grad_norm": 6.10373067855835, + "learning_rate": 6.3591503611817155e-06, + "loss": 1.65877103805542, + "refine_loss": 0.0, + "step": 430 + }, + { + "ce_loss": 0.46967148780822754, + "epoch": 0.15931984104981056, + "grad_norm": 9.630118370056152, + "learning_rate": 6.343559255781538e-06, + "loss": 1.8786859512329102, + "refine_loss": 0.0, + "step": 431 + }, + { + "ce_loss": 0.37067699432373047, + "epoch": 0.1596894926531744, + "grad_norm": 7.606201171875, + "learning_rate": 6.3279540570805265e-06, + "loss": 1.4827079772949219, + "refine_loss": 0.0, + "step": 432 + }, + { + "ce_loss": 0.46851563453674316, + "epoch": 0.16005914425653822, + "grad_norm": 9.33636474609375, + "learning_rate": 6.3123349287698345e-06, + "loss": 1.8740625381469727, + "refine_loss": 0.0, + "step": 433 + }, + { + "ce_loss": 0.41760313510894775, + "epoch": 0.16042879585990205, + "grad_norm": 9.52145767211914, + "learning_rate": 6.296702034686726e-06, + "loss": 1.670412540435791, + "refine_loss": 0.0, + "step": 434 + }, + { + "ce_loss": 0.5646244287490845, + "epoch": 0.16079844746326588, + "grad_norm": 14.739233016967773, + "learning_rate": 6.281055538812861e-06, + "loss": 2.258497714996338, + "refine_loss": 0.0, + "step": 435 + }, + { + "ce_loss": 0.5646448135375977, + "epoch": 0.1611680990666297, + "grad_norm": 10.327033996582031, + "learning_rate": 6.265395605272581e-06, + "loss": 2.2585792541503906, + "refine_loss": 0.0, + "step": 436 + }, + { + "ce_loss": 0.5255656242370605, + "epoch": 0.16153775066999354, + "grad_norm": 11.02020263671875, + "learning_rate": 6.249722398331177e-06, + "loss": 2.102262496948242, + "refine_loss": 0.0, + "step": 437 + }, + { + "ce_loss": 0.2849227786064148, + "epoch": 0.16190740227335737, + "grad_norm": 8.077251434326172, + "learning_rate": 6.234036082393171e-06, + "loss": 1.1396911144256592, + "refine_loss": 0.0, + "step": 438 + }, + { + "ce_loss": 0.4458034038543701, + "epoch": 0.1622770538767212, + "grad_norm": 7.966236591339111, + "learning_rate": 6.218336822000598e-06, + "loss": 1.7832136154174805, + "refine_loss": 0.0, + "step": 439 + }, + { + "ce_loss": 0.4162788391113281, + "epoch": 0.16264670548008503, + "grad_norm": 9.13566780090332, + "learning_rate": 6.202624781831269e-06, + "loss": 1.6651153564453125, + "refine_loss": 0.0, + "step": 440 + }, + { + "ce_loss": 0.4881707429885864, + "epoch": 0.16301635708344886, + "grad_norm": 9.531209945678711, + "learning_rate": 6.18690012669705e-06, + "loss": 1.9526829719543457, + "refine_loss": 0.0, + "step": 441 + }, + { + "ce_loss": 0.46038389205932617, + "epoch": 0.1633860086868127, + "grad_norm": 8.59526252746582, + "learning_rate": 6.171163021542134e-06, + "loss": 1.8415355682373047, + "refine_loss": 0.0, + "step": 442 + }, + { + "ce_loss": 0.48354434967041016, + "epoch": 0.16375566029017652, + "grad_norm": 9.290780067443848, + "learning_rate": 6.155413631441307e-06, + "loss": 1.9341773986816406, + "refine_loss": 0.0, + "step": 443 + }, + { + "ce_loss": 0.5096735954284668, + "epoch": 0.16412531189354035, + "grad_norm": 8.205672264099121, + "learning_rate": 6.139652121598219e-06, + "loss": 2.038694381713867, + "refine_loss": 0.0, + "step": 444 + }, + { + "ce_loss": 0.48086535930633545, + "epoch": 0.16449496349690418, + "grad_norm": 9.108717918395996, + "learning_rate": 6.123878657343648e-06, + "loss": 1.9234614372253418, + "refine_loss": 0.0, + "step": 445 + }, + { + "ce_loss": 0.37310242652893066, + "epoch": 0.164864615100268, + "grad_norm": 8.828269958496094, + "learning_rate": 6.108093404133772e-06, + "loss": 1.4924097061157227, + "refine_loss": 0.0, + "step": 446 + }, + { + "ce_loss": 0.3573284149169922, + "epoch": 0.16523426670363184, + "grad_norm": 8.475523948669434, + "learning_rate": 6.092296527548427e-06, + "loss": 1.4293136596679688, + "refine_loss": 0.0, + "step": 447 + }, + { + "ce_loss": 0.49456751346588135, + "epoch": 0.16560391830699567, + "grad_norm": 9.552000999450684, + "learning_rate": 6.076488193289375e-06, + "loss": 1.9782700538635254, + "refine_loss": 0.0, + "step": 448 + }, + { + "ce_loss": 0.573183536529541, + "epoch": 0.1659735699103595, + "grad_norm": 9.836374282836914, + "learning_rate": 6.060668567178561e-06, + "loss": 2.292734146118164, + "refine_loss": 0.0, + "step": 449 + }, + { + "ce_loss": 0.4885174632072449, + "epoch": 0.16634322151372333, + "grad_norm": 9.59506893157959, + "learning_rate": 6.044837815156377e-06, + "loss": 1.9540698528289795, + "refine_loss": 0.0, + "step": 450 + }, + { + "ce_loss": 0.39499950408935547, + "epoch": 0.16671287311708716, + "grad_norm": 8.713544845581055, + "learning_rate": 6.028996103279918e-06, + "loss": 1.5799980163574219, + "refine_loss": 0.0, + "step": 451 + }, + { + "ce_loss": 0.40126872062683105, + "epoch": 0.167082524720451, + "grad_norm": 7.154967784881592, + "learning_rate": 6.013143597721252e-06, + "loss": 1.6050748825073242, + "refine_loss": 0.0, + "step": 452 + }, + { + "ce_loss": 0.48070669174194336, + "epoch": 0.16745217632381482, + "grad_norm": 9.758798599243164, + "learning_rate": 5.997280464765655e-06, + "loss": 1.9228267669677734, + "refine_loss": 0.0, + "step": 453 + }, + { + "ce_loss": 0.3813167214393616, + "epoch": 0.16782182792717865, + "grad_norm": 7.322877407073975, + "learning_rate": 5.981406870809889e-06, + "loss": 1.5252668857574463, + "refine_loss": 0.0, + "step": 454 + }, + { + "ce_loss": 0.47611117362976074, + "epoch": 0.16819147953054248, + "grad_norm": 9.625687599182129, + "learning_rate": 5.965522982360441e-06, + "loss": 1.904444694519043, + "refine_loss": 0.0, + "step": 455 + }, + { + "ce_loss": 0.5136280059814453, + "epoch": 0.1685611311339063, + "grad_norm": 8.267143249511719, + "learning_rate": 5.949628966031785e-06, + "loss": 2.0545120239257812, + "refine_loss": 0.0, + "step": 456 + }, + { + "ce_loss": 0.5073539018630981, + "epoch": 0.16893078273727014, + "grad_norm": 7.593673229217529, + "learning_rate": 5.933724988544632e-06, + "loss": 2.0294156074523926, + "refine_loss": 0.0, + "step": 457 + }, + { + "ce_loss": 0.3930559754371643, + "epoch": 0.16930043434063397, + "grad_norm": 8.188336372375488, + "learning_rate": 5.9178112167241805e-06, + "loss": 1.5722239017486572, + "refine_loss": 0.0, + "step": 458 + }, + { + "ce_loss": 0.4887848496437073, + "epoch": 0.1696700859439978, + "grad_norm": 7.380640983581543, + "learning_rate": 5.9018878174983674e-06, + "loss": 1.955139398574829, + "refine_loss": 0.0, + "step": 459 + }, + { + "ce_loss": 0.37317216396331787, + "epoch": 0.17003973754736162, + "grad_norm": 7.1016411781311035, + "learning_rate": 5.885954957896115e-06, + "loss": 1.4926886558532715, + "refine_loss": 0.0, + "step": 460 + }, + { + "ce_loss": 0.495633602142334, + "epoch": 0.17040938915072545, + "grad_norm": 8.00838851928711, + "learning_rate": 5.87001280504558e-06, + "loss": 1.982534408569336, + "refine_loss": 0.0, + "step": 461 + }, + { + "ce_loss": 0.27469444274902344, + "epoch": 0.17077904075408928, + "grad_norm": 9.424318313598633, + "learning_rate": 5.854061526172402e-06, + "loss": 1.0987777709960938, + "refine_loss": 0.0, + "step": 462 + }, + { + "ce_loss": 0.4084160327911377, + "epoch": 0.1711486923574531, + "grad_norm": 8.507837295532227, + "learning_rate": 5.838101288597951e-06, + "loss": 1.6336641311645508, + "refine_loss": 0.0, + "step": 463 + }, + { + "ce_loss": 0.3457944393157959, + "epoch": 0.17151834396081694, + "grad_norm": 8.290342330932617, + "learning_rate": 5.822132259737565e-06, + "loss": 1.3831777572631836, + "refine_loss": 0.0, + "step": 464 + }, + { + "ce_loss": 0.45686161518096924, + "epoch": 0.17188799556418077, + "grad_norm": 8.378122329711914, + "learning_rate": 5.806154607098799e-06, + "loss": 1.827446460723877, + "refine_loss": 0.0, + "step": 465 + }, + { + "ce_loss": 0.3632063865661621, + "epoch": 0.1722576471675446, + "grad_norm": 9.759676933288574, + "learning_rate": 5.7901684982796716e-06, + "loss": 1.4528255462646484, + "refine_loss": 0.0, + "step": 466 + }, + { + "ce_loss": 0.48107755184173584, + "epoch": 0.1726272987709084, + "grad_norm": 14.348111152648926, + "learning_rate": 5.774174100966899e-06, + "loss": 1.9243102073669434, + "refine_loss": 0.0, + "step": 467 + }, + { + "ce_loss": 0.3647770881652832, + "epoch": 0.17299695037427223, + "grad_norm": 8.199952125549316, + "learning_rate": 5.75817158293414e-06, + "loss": 1.4591083526611328, + "refine_loss": 0.0, + "step": 468 + }, + { + "ce_loss": 0.3100966215133667, + "epoch": 0.17336660197763606, + "grad_norm": 12.82120132446289, + "learning_rate": 5.742161112040237e-06, + "loss": 1.2403864860534668, + "refine_loss": 0.0, + "step": 469 + }, + { + "ce_loss": 0.40797120332717896, + "epoch": 0.1737362535809999, + "grad_norm": 9.752256393432617, + "learning_rate": 5.726142856227453e-06, + "loss": 1.6318848133087158, + "refine_loss": 0.0, + "step": 470 + }, + { + "ce_loss": 0.4822409749031067, + "epoch": 0.17410590518436372, + "grad_norm": 9.626302719116211, + "learning_rate": 5.7101169835197115e-06, + "loss": 1.9289638996124268, + "refine_loss": 0.0, + "step": 471 + }, + { + "ce_loss": 0.4746251106262207, + "epoch": 0.17447555678772755, + "grad_norm": 9.977282524108887, + "learning_rate": 5.694083662020835e-06, + "loss": 1.8985004425048828, + "refine_loss": 0.0, + "step": 472 + }, + { + "ce_loss": 0.5701179504394531, + "epoch": 0.17484520839109138, + "grad_norm": 8.945584297180176, + "learning_rate": 5.678043059912776e-06, + "loss": 2.2804718017578125, + "refine_loss": 0.0, + "step": 473 + }, + { + "ce_loss": 0.3851062059402466, + "epoch": 0.1752148599944552, + "grad_norm": 9.323137283325195, + "learning_rate": 5.661995345453867e-06, + "loss": 1.5404248237609863, + "refine_loss": 0.0, + "step": 474 + }, + { + "ce_loss": 0.43563711643218994, + "epoch": 0.17558451159781904, + "grad_norm": 12.931821823120117, + "learning_rate": 5.645940686977033e-06, + "loss": 1.7425484657287598, + "refine_loss": 0.0, + "step": 475 + }, + { + "ce_loss": 0.4737333059310913, + "epoch": 0.17595416320118287, + "grad_norm": 8.269094467163086, + "learning_rate": 5.629879252888046e-06, + "loss": 1.8949332237243652, + "refine_loss": 0.0, + "step": 476 + }, + { + "ce_loss": 0.4387834072113037, + "epoch": 0.1763238148045467, + "grad_norm": 7.851072788238525, + "learning_rate": 5.613811211663751e-06, + "loss": 1.7551336288452148, + "refine_loss": 0.0, + "step": 477 + }, + { + "ce_loss": 0.47638988494873047, + "epoch": 0.17669346640791053, + "grad_norm": 9.417006492614746, + "learning_rate": 5.597736731850295e-06, + "loss": 1.9055595397949219, + "refine_loss": 0.0, + "step": 478 + }, + { + "ce_loss": 0.4597822427749634, + "epoch": 0.17706311801127436, + "grad_norm": 7.66142463684082, + "learning_rate": 5.581655982061367e-06, + "loss": 1.8391289710998535, + "refine_loss": 0.0, + "step": 479 + }, + { + "ce_loss": 0.47592639923095703, + "epoch": 0.1774327696146382, + "grad_norm": 9.574882507324219, + "learning_rate": 5.5655691309764225e-06, + "loss": 1.9037055969238281, + "refine_loss": 0.0, + "step": 480 + }, + { + "ce_loss": 0.3618781566619873, + "epoch": 0.17780242121800202, + "grad_norm": 8.154146194458008, + "learning_rate": 5.549476347338915e-06, + "loss": 1.4475126266479492, + "refine_loss": 0.0, + "step": 481 + }, + { + "ce_loss": 0.42792582511901855, + "epoch": 0.17817207282136585, + "grad_norm": 7.802042484283447, + "learning_rate": 5.533377799954532e-06, + "loss": 1.7117033004760742, + "refine_loss": 0.0, + "step": 482 + }, + { + "ce_loss": 0.4880123734474182, + "epoch": 0.17854172442472968, + "grad_norm": 13.051204681396484, + "learning_rate": 5.517273657689419e-06, + "loss": 1.9520494937896729, + "refine_loss": 0.0, + "step": 483 + }, + { + "ce_loss": 0.49871787428855896, + "epoch": 0.1789113760280935, + "grad_norm": 10.877169609069824, + "learning_rate": 5.501164089468406e-06, + "loss": 1.9948714971542358, + "refine_loss": 0.0, + "step": 484 + }, + { + "ce_loss": 0.371906042098999, + "epoch": 0.17928102763145734, + "grad_norm": 8.520963668823242, + "learning_rate": 5.485049264273241e-06, + "loss": 1.487624168395996, + "refine_loss": 0.0, + "step": 485 + }, + { + "ce_loss": 0.609112024307251, + "epoch": 0.17965067923482117, + "grad_norm": 11.965548515319824, + "learning_rate": 5.4689293511408155e-06, + "loss": 2.436448097229004, + "refine_loss": 0.0, + "step": 486 + }, + { + "ce_loss": 0.5074594020843506, + "epoch": 0.180020330838185, + "grad_norm": 10.944469451904297, + "learning_rate": 5.45280451916139e-06, + "loss": 2.0298376083374023, + "refine_loss": 0.0, + "step": 487 + }, + { + "ce_loss": 0.4492247402667999, + "epoch": 0.18038998244154883, + "grad_norm": 9.593073844909668, + "learning_rate": 5.43667493747682e-06, + "loss": 1.7968989610671997, + "refine_loss": 0.0, + "step": 488 + }, + { + "ce_loss": 0.4534952640533447, + "epoch": 0.18075963404491266, + "grad_norm": 8.449090957641602, + "learning_rate": 5.4205407752787884e-06, + "loss": 1.813981056213379, + "refine_loss": 0.0, + "step": 489 + }, + { + "ce_loss": 0.4622027575969696, + "epoch": 0.1811292856482765, + "grad_norm": 10.44463062286377, + "learning_rate": 5.404402201807022e-06, + "loss": 1.8488110303878784, + "refine_loss": 0.0, + "step": 490 + }, + { + "ce_loss": 0.4628638029098511, + "epoch": 0.18149893725164032, + "grad_norm": 13.345504760742188, + "learning_rate": 5.388259386347518e-06, + "loss": 1.8514552116394043, + "refine_loss": 0.0, + "step": 491 + }, + { + "ce_loss": 0.33748340606689453, + "epoch": 0.18186858885500415, + "grad_norm": 8.433563232421875, + "learning_rate": 5.372112498230771e-06, + "loss": 1.3499336242675781, + "refine_loss": 0.0, + "step": 492 + }, + { + "ce_loss": 0.4906718134880066, + "epoch": 0.18223824045836798, + "grad_norm": 13.208283424377441, + "learning_rate": 5.355961706829997e-06, + "loss": 1.9626872539520264, + "refine_loss": 0.0, + "step": 493 + }, + { + "ce_loss": 0.4528672695159912, + "epoch": 0.1826078920617318, + "grad_norm": 8.552472114562988, + "learning_rate": 5.339807181559359e-06, + "loss": 1.8114690780639648, + "refine_loss": 0.0, + "step": 494 + }, + { + "ce_loss": 0.5014204978942871, + "epoch": 0.18297754366509564, + "grad_norm": 10.789557456970215, + "learning_rate": 5.323649091872179e-06, + "loss": 2.0056819915771484, + "refine_loss": 0.0, + "step": 495 + }, + { + "ce_loss": 0.6323814392089844, + "epoch": 0.18334719526845947, + "grad_norm": 11.628595352172852, + "learning_rate": 5.307487607259175e-06, + "loss": 2.5295257568359375, + "refine_loss": 0.0, + "step": 496 + }, + { + "ce_loss": 0.544074535369873, + "epoch": 0.1837168468718233, + "grad_norm": 11.151312828063965, + "learning_rate": 5.291322897246669e-06, + "loss": 2.176298141479492, + "refine_loss": 0.0, + "step": 497 + }, + { + "ce_loss": 0.4879574775695801, + "epoch": 0.18408649847518713, + "grad_norm": 8.91184139251709, + "learning_rate": 5.275155131394825e-06, + "loss": 1.9518299102783203, + "refine_loss": 0.0, + "step": 498 + }, + { + "ce_loss": 0.439935564994812, + "epoch": 0.18445615007855096, + "grad_norm": 7.986157417297363, + "learning_rate": 5.258984479295853e-06, + "loss": 1.759742259979248, + "refine_loss": 0.0, + "step": 499 + }, + { + "ce_loss": 0.35772401094436646, + "epoch": 0.1848258016819148, + "grad_norm": 11.187300682067871, + "learning_rate": 5.242811110572243e-06, + "loss": 1.4308960437774658, + "refine_loss": 0.0, + "step": 500 + }, + { + "ce_loss": 0.5437698364257812, + "epoch": 0.18519545328527862, + "grad_norm": 10.264204978942871, + "learning_rate": 5.226635194874978e-06, + "loss": 2.175079345703125, + "refine_loss": 0.0, + "step": 501 + }, + { + "ce_loss": 0.483748197555542, + "epoch": 0.18556510488864245, + "grad_norm": 11.65782642364502, + "learning_rate": 5.210456901881761e-06, + "loss": 1.934992790222168, + "refine_loss": 0.0, + "step": 502 + }, + { + "ce_loss": 0.4829123020172119, + "epoch": 0.18593475649200628, + "grad_norm": 8.923626899719238, + "learning_rate": 5.194276401295231e-06, + "loss": 1.9316492080688477, + "refine_loss": 0.0, + "step": 503 + }, + { + "ce_loss": 0.49942851066589355, + "epoch": 0.1863044080953701, + "grad_norm": 9.49170970916748, + "learning_rate": 5.1780938628411795e-06, + "loss": 1.9977140426635742, + "refine_loss": 0.0, + "step": 504 + }, + { + "ce_loss": 0.4504272937774658, + "epoch": 0.18667405969873394, + "grad_norm": 10.648200988769531, + "learning_rate": 5.161909456266781e-06, + "loss": 1.8017091751098633, + "refine_loss": 0.0, + "step": 505 + }, + { + "ce_loss": 0.5583152770996094, + "epoch": 0.18704371130209776, + "grad_norm": 9.489195823669434, + "learning_rate": 5.145723351338799e-06, + "loss": 2.2332611083984375, + "refine_loss": 0.0, + "step": 506 + }, + { + "ce_loss": 0.401568740606308, + "epoch": 0.1874133629054616, + "grad_norm": 8.862239837646484, + "learning_rate": 5.129535717841818e-06, + "loss": 1.606274962425232, + "refine_loss": 0.0, + "step": 507 + }, + { + "ce_loss": 0.5007185935974121, + "epoch": 0.18778301450882542, + "grad_norm": 10.425126075744629, + "learning_rate": 5.11334672557645e-06, + "loss": 2.0028743743896484, + "refine_loss": 0.0, + "step": 508 + }, + { + "ce_loss": 0.3522944450378418, + "epoch": 0.18815266611218925, + "grad_norm": 8.690531730651855, + "learning_rate": 5.097156544357567e-06, + "loss": 1.4091777801513672, + "refine_loss": 0.0, + "step": 509 + }, + { + "ce_loss": 0.4302210807800293, + "epoch": 0.18852231771555308, + "grad_norm": 8.877826690673828, + "learning_rate": 5.080965344012509e-06, + "loss": 1.7208843231201172, + "refine_loss": 0.0, + "step": 510 + }, + { + "ce_loss": 0.46343994140625, + "epoch": 0.1888919693189169, + "grad_norm": 8.290945053100586, + "learning_rate": 5.064773294379302e-06, + "loss": 1.853759765625, + "refine_loss": 0.0, + "step": 511 + }, + { + "ce_loss": 0.39203080534935, + "epoch": 0.18926162092228074, + "grad_norm": 9.424485206604004, + "learning_rate": 5.048580565304887e-06, + "loss": 1.5681232213974, + "refine_loss": 0.0, + "step": 512 + }, + { + "ce_loss": 0.4844989776611328, + "epoch": 0.18963127252564457, + "grad_norm": 10.192035675048828, + "learning_rate": 5.032387326643331e-06, + "loss": 1.9379959106445312, + "refine_loss": 0.0, + "step": 513 + }, + { + "ce_loss": 0.4570794105529785, + "epoch": 0.1900009241290084, + "grad_norm": 9.515115737915039, + "learning_rate": 5.016193748254045e-06, + "loss": 1.828317642211914, + "refine_loss": 0.0, + "step": 514 + }, + { + "ce_loss": 0.5199398994445801, + "epoch": 0.19037057573237223, + "grad_norm": 10.294302940368652, + "learning_rate": 5e-06, + "loss": 2.0797595977783203, + "refine_loss": 0.0, + "step": 515 + }, + { + "ce_loss": 0.5341153144836426, + "epoch": 0.19074022733573606, + "grad_norm": 10.828805923461914, + "learning_rate": 4.983806251745958e-06, + "loss": 2.1364612579345703, + "refine_loss": 0.0, + "step": 516 + }, + { + "ce_loss": 0.25566816329956055, + "epoch": 0.1911098789390999, + "grad_norm": 12.14079761505127, + "learning_rate": 4.9676126733566705e-06, + "loss": 1.0226726531982422, + "refine_loss": 0.0, + "step": 517 + }, + { + "ce_loss": 0.3013119697570801, + "epoch": 0.19147953054246372, + "grad_norm": 8.566230773925781, + "learning_rate": 4.951419434695115e-06, + "loss": 1.2052478790283203, + "refine_loss": 0.0, + "step": 518 + }, + { + "ce_loss": 0.5011106729507446, + "epoch": 0.19184918214582755, + "grad_norm": 9.185887336730957, + "learning_rate": 4.935226705620699e-06, + "loss": 2.0044426918029785, + "refine_loss": 0.0, + "step": 519 + }, + { + "ce_loss": 0.36000490188598633, + "epoch": 0.19221883374919138, + "grad_norm": 8.155330657958984, + "learning_rate": 4.919034655987493e-06, + "loss": 1.4400196075439453, + "refine_loss": 0.0, + "step": 520 + }, + { + "ce_loss": 0.38439083099365234, + "epoch": 0.1925884853525552, + "grad_norm": 9.90329647064209, + "learning_rate": 4.9028434556424335e-06, + "loss": 1.5375633239746094, + "refine_loss": 0.0, + "step": 521 + }, + { + "ce_loss": 0.44377803802490234, + "epoch": 0.19295813695591904, + "grad_norm": 9.216550827026367, + "learning_rate": 4.886653274423551e-06, + "loss": 1.7751121520996094, + "refine_loss": 0.0, + "step": 522 + }, + { + "ce_loss": 0.4957590103149414, + "epoch": 0.19332778855928287, + "grad_norm": 10.764482498168945, + "learning_rate": 4.870464282158184e-06, + "loss": 1.9830360412597656, + "refine_loss": 0.0, + "step": 523 + }, + { + "ce_loss": 0.35591447353363037, + "epoch": 0.1936974401626467, + "grad_norm": 9.363845825195312, + "learning_rate": 4.8542766486612035e-06, + "loss": 1.4236578941345215, + "refine_loss": 0.0, + "step": 524 + }, + { + "ce_loss": 0.5641214847564697, + "epoch": 0.19406709176601053, + "grad_norm": 14.866276741027832, + "learning_rate": 4.838090543733222e-06, + "loss": 2.256485939025879, + "refine_loss": 0.0, + "step": 525 + }, + { + "ce_loss": 0.5610926151275635, + "epoch": 0.19443674336937436, + "grad_norm": 9.813082695007324, + "learning_rate": 4.821906137158822e-06, + "loss": 2.244370460510254, + "refine_loss": 0.0, + "step": 526 + }, + { + "ce_loss": 0.32414674758911133, + "epoch": 0.1948063949727382, + "grad_norm": 8.490361213684082, + "learning_rate": 4.805723598704772e-06, + "loss": 1.2965869903564453, + "refine_loss": 0.0, + "step": 527 + }, + { + "ce_loss": 0.4826939105987549, + "epoch": 0.19517604657610202, + "grad_norm": 8.25439453125, + "learning_rate": 4.7895430981182415e-06, + "loss": 1.9307756423950195, + "refine_loss": 0.0, + "step": 528 + }, + { + "ce_loss": 0.3672596216201782, + "epoch": 0.19554569817946585, + "grad_norm": 8.924995422363281, + "learning_rate": 4.773364805125025e-06, + "loss": 1.469038486480713, + "refine_loss": 0.0, + "step": 529 + }, + { + "ce_loss": 0.4726506471633911, + "epoch": 0.19591534978282968, + "grad_norm": 9.414588928222656, + "learning_rate": 4.757188889427761e-06, + "loss": 1.8906025886535645, + "refine_loss": 0.0, + "step": 530 + }, + { + "ce_loss": 0.5611264705657959, + "epoch": 0.1962850013861935, + "grad_norm": 9.468201637268066, + "learning_rate": 4.741015520704148e-06, + "loss": 2.2445058822631836, + "refine_loss": 0.0, + "step": 531 + }, + { + "ce_loss": 0.32427549362182617, + "epoch": 0.19665465298955734, + "grad_norm": 7.357333660125732, + "learning_rate": 4.724844868605176e-06, + "loss": 1.2971019744873047, + "refine_loss": 0.0, + "step": 532 + }, + { + "ce_loss": 0.5099225044250488, + "epoch": 0.19702430459292117, + "grad_norm": 9.045973777770996, + "learning_rate": 4.708677102753331e-06, + "loss": 2.0396900177001953, + "refine_loss": 0.0, + "step": 533 + }, + { + "ce_loss": 0.46609950065612793, + "epoch": 0.197393956196285, + "grad_norm": 9.092626571655273, + "learning_rate": 4.6925123927408265e-06, + "loss": 1.8643980026245117, + "refine_loss": 0.0, + "step": 534 + }, + { + "ce_loss": 0.51667720079422, + "epoch": 0.19776360779964883, + "grad_norm": 11.879148483276367, + "learning_rate": 4.6763509081278215e-06, + "loss": 2.06670880317688, + "refine_loss": 0.0, + "step": 535 + }, + { + "ce_loss": 0.5312578678131104, + "epoch": 0.19813325940301266, + "grad_norm": 10.984776496887207, + "learning_rate": 4.660192818440642e-06, + "loss": 2.1250314712524414, + "refine_loss": 0.0, + "step": 536 + }, + { + "ce_loss": 0.29234206676483154, + "epoch": 0.1985029110063765, + "grad_norm": 7.628251552581787, + "learning_rate": 4.644038293170003e-06, + "loss": 1.1693682670593262, + "refine_loss": 0.0, + "step": 537 + }, + { + "ce_loss": 0.4267864227294922, + "epoch": 0.19887256260974032, + "grad_norm": 9.515523910522461, + "learning_rate": 4.627887501769231e-06, + "loss": 1.7071456909179688, + "refine_loss": 0.0, + "step": 538 + }, + { + "ce_loss": 0.5450105667114258, + "epoch": 0.19924221421310415, + "grad_norm": 10.489255905151367, + "learning_rate": 4.611740613652485e-06, + "loss": 2.180042266845703, + "refine_loss": 0.0, + "step": 539 + }, + { + "ce_loss": 0.5572495460510254, + "epoch": 0.19961186581646798, + "grad_norm": 9.841736793518066, + "learning_rate": 4.59559779819298e-06, + "loss": 2.2289981842041016, + "refine_loss": 0.0, + "step": 540 + }, + { + "ce_loss": 0.3885200023651123, + "epoch": 0.1999815174198318, + "grad_norm": 9.033206939697266, + "learning_rate": 4.579459224721212e-06, + "loss": 1.5540800094604492, + "refine_loss": 0.0, + "step": 541 + }, + { + "ce_loss": 0.47109830379486084, + "epoch": 0.20035116902319564, + "grad_norm": 8.770919799804688, + "learning_rate": 4.5633250625231806e-06, + "loss": 1.8843932151794434, + "refine_loss": 0.0, + "step": 542 + }, + { + "ce_loss": 0.449929416179657, + "epoch": 0.20072082062655947, + "grad_norm": 9.22470760345459, + "learning_rate": 4.547195480838612e-06, + "loss": 1.799717664718628, + "refine_loss": 0.0, + "step": 543 + }, + { + "ce_loss": 0.5060431957244873, + "epoch": 0.2010904722299233, + "grad_norm": 8.583282470703125, + "learning_rate": 4.531070648859186e-06, + "loss": 2.024172782897949, + "refine_loss": 0.0, + "step": 544 + }, + { + "ce_loss": 0.4480346441268921, + "epoch": 0.20146012383328712, + "grad_norm": 11.344508171081543, + "learning_rate": 4.51495073572676e-06, + "loss": 1.7921385765075684, + "refine_loss": 0.0, + "step": 545 + }, + { + "ce_loss": 0.3515781760215759, + "epoch": 0.20182977543665095, + "grad_norm": 9.575823783874512, + "learning_rate": 4.498835910531595e-06, + "loss": 1.4063127040863037, + "refine_loss": 0.0, + "step": 546 + }, + { + "ce_loss": 0.4829676151275635, + "epoch": 0.20219942704001478, + "grad_norm": 9.129422187805176, + "learning_rate": 4.482726342310582e-06, + "loss": 1.931870460510254, + "refine_loss": 0.0, + "step": 547 + }, + { + "ce_loss": 0.50811767578125, + "epoch": 0.20256907864337861, + "grad_norm": 10.241291046142578, + "learning_rate": 4.4666222000454685e-06, + "loss": 2.032470703125, + "refine_loss": 0.0, + "step": 548 + }, + { + "ce_loss": 0.5188241004943848, + "epoch": 0.20293873024674244, + "grad_norm": 9.321981430053711, + "learning_rate": 4.450523652661086e-06, + "loss": 2.075296401977539, + "refine_loss": 0.0, + "step": 549 + }, + { + "ce_loss": 0.5430876016616821, + "epoch": 0.20330838185010627, + "grad_norm": 8.72467041015625, + "learning_rate": 4.434430869023579e-06, + "loss": 2.1723504066467285, + "refine_loss": 0.0, + "step": 550 + }, + { + "ce_loss": 0.47068357467651367, + "epoch": 0.2036780334534701, + "grad_norm": 9.14653205871582, + "learning_rate": 4.418344017938634e-06, + "loss": 1.8827342987060547, + "refine_loss": 0.0, + "step": 551 + }, + { + "ce_loss": 0.37733888626098633, + "epoch": 0.20404768505683393, + "grad_norm": 8.255782127380371, + "learning_rate": 4.402263268149707e-06, + "loss": 1.5093555450439453, + "refine_loss": 0.0, + "step": 552 + }, + { + "ce_loss": 0.5609667301177979, + "epoch": 0.20441733666019776, + "grad_norm": 11.809683799743652, + "learning_rate": 4.386188788336251e-06, + "loss": 2.2438669204711914, + "refine_loss": 0.0, + "step": 553 + }, + { + "ce_loss": 0.44942522048950195, + "epoch": 0.2047869882635616, + "grad_norm": 8.321514129638672, + "learning_rate": 4.370120747111956e-06, + "loss": 1.7977008819580078, + "refine_loss": 0.0, + "step": 554 + }, + { + "ce_loss": 0.4635310173034668, + "epoch": 0.20515663986692542, + "grad_norm": 9.233635902404785, + "learning_rate": 4.3540593130229695e-06, + "loss": 1.8541240692138672, + "refine_loss": 0.0, + "step": 555 + }, + { + "ce_loss": 0.5236377716064453, + "epoch": 0.20552629147028925, + "grad_norm": 10.339839935302734, + "learning_rate": 4.338004654546136e-06, + "loss": 2.0945510864257812, + "refine_loss": 0.0, + "step": 556 + }, + { + "ce_loss": 0.5152699947357178, + "epoch": 0.20589594307365308, + "grad_norm": 10.1702299118042, + "learning_rate": 4.3219569400872244e-06, + "loss": 2.061079978942871, + "refine_loss": 0.0, + "step": 557 + }, + { + "ce_loss": 0.4415321350097656, + "epoch": 0.2062655946770169, + "grad_norm": 7.9304375648498535, + "learning_rate": 4.3059163379791676e-06, + "loss": 1.7661285400390625, + "refine_loss": 0.0, + "step": 558 + }, + { + "ce_loss": 0.4095269441604614, + "epoch": 0.20663524628038074, + "grad_norm": 12.99287223815918, + "learning_rate": 4.289883016480291e-06, + "loss": 1.6381077766418457, + "refine_loss": 0.0, + "step": 559 + }, + { + "ce_loss": 0.45501530170440674, + "epoch": 0.20700489788374457, + "grad_norm": 8.761757850646973, + "learning_rate": 4.27385714377255e-06, + "loss": 1.820061206817627, + "refine_loss": 0.0, + "step": 560 + }, + { + "ce_loss": 0.3384588956832886, + "epoch": 0.2073745494871084, + "grad_norm": 8.16963005065918, + "learning_rate": 4.257838887959764e-06, + "loss": 1.3538355827331543, + "refine_loss": 0.0, + "step": 561 + }, + { + "ce_loss": 0.4460237920284271, + "epoch": 0.20774420109047223, + "grad_norm": 13.857954025268555, + "learning_rate": 4.24182841706586e-06, + "loss": 1.7840951681137085, + "refine_loss": 0.0, + "step": 562 + }, + { + "ce_loss": 0.45212459564208984, + "epoch": 0.20811385269383606, + "grad_norm": 10.292985916137695, + "learning_rate": 4.2258258990331015e-06, + "loss": 1.8084983825683594, + "refine_loss": 0.0, + "step": 563 + }, + { + "ce_loss": 0.5922918319702148, + "epoch": 0.2084835042971999, + "grad_norm": 8.808961868286133, + "learning_rate": 4.209831501720328e-06, + "loss": 2.3691673278808594, + "refine_loss": 0.0, + "step": 564 + }, + { + "ce_loss": 0.3115156888961792, + "epoch": 0.20885315590056372, + "grad_norm": 10.01165771484375, + "learning_rate": 4.1938453929012014e-06, + "loss": 1.2460627555847168, + "refine_loss": 0.0, + "step": 565 + }, + { + "ce_loss": 0.39396190643310547, + "epoch": 0.20922280750392755, + "grad_norm": 7.76950740814209, + "learning_rate": 4.177867740262437e-06, + "loss": 1.5758476257324219, + "refine_loss": 0.0, + "step": 566 + }, + { + "ce_loss": 0.5523529052734375, + "epoch": 0.20959245910729138, + "grad_norm": 9.470449447631836, + "learning_rate": 4.16189871140205e-06, + "loss": 2.20941162109375, + "refine_loss": 0.0, + "step": 567 + }, + { + "ce_loss": 0.47641921043395996, + "epoch": 0.2099621107106552, + "grad_norm": 9.065677642822266, + "learning_rate": 4.145938473827598e-06, + "loss": 1.9056768417358398, + "refine_loss": 0.0, + "step": 568 + }, + { + "ce_loss": 0.4563819169998169, + "epoch": 0.21033176231401904, + "grad_norm": 10.14564037322998, + "learning_rate": 4.129987194954421e-06, + "loss": 1.8255276679992676, + "refine_loss": 0.0, + "step": 569 + }, + { + "ce_loss": 0.5147643089294434, + "epoch": 0.21070141391738287, + "grad_norm": 12.83034896850586, + "learning_rate": 4.1140450421038865e-06, + "loss": 2.0590572357177734, + "refine_loss": 0.0, + "step": 570 + }, + { + "ce_loss": 0.4355369210243225, + "epoch": 0.2110710655207467, + "grad_norm": 8.789459228515625, + "learning_rate": 4.098112182501633e-06, + "loss": 1.74214768409729, + "refine_loss": 0.0, + "step": 571 + }, + { + "ce_loss": 0.39447200298309326, + "epoch": 0.21144071712411053, + "grad_norm": 10.074094772338867, + "learning_rate": 4.08218878327582e-06, + "loss": 1.577888011932373, + "refine_loss": 0.0, + "step": 572 + }, + { + "ce_loss": 0.46988534927368164, + "epoch": 0.21181036872747436, + "grad_norm": 11.248330116271973, + "learning_rate": 4.066275011455369e-06, + "loss": 1.8795413970947266, + "refine_loss": 0.0, + "step": 573 + }, + { + "ce_loss": 0.42568179965019226, + "epoch": 0.2121800203308382, + "grad_norm": 9.135637283325195, + "learning_rate": 4.050371033968216e-06, + "loss": 1.702727198600769, + "refine_loss": 0.0, + "step": 574 + }, + { + "ce_loss": 0.43598151206970215, + "epoch": 0.21254967193420202, + "grad_norm": 8.788455963134766, + "learning_rate": 4.034477017639561e-06, + "loss": 1.7439260482788086, + "refine_loss": 0.0, + "step": 575 + }, + { + "ce_loss": 0.4418013095855713, + "epoch": 0.21291932353756585, + "grad_norm": 12.566743850708008, + "learning_rate": 4.018593129190113e-06, + "loss": 1.7672052383422852, + "refine_loss": 0.0, + "step": 576 + }, + { + "ce_loss": 0.5104196071624756, + "epoch": 0.21328897514092968, + "grad_norm": 9.829388618469238, + "learning_rate": 4.002719535234346e-06, + "loss": 2.0416784286499023, + "refine_loss": 0.0, + "step": 577 + }, + { + "ce_loss": 0.4454650282859802, + "epoch": 0.2136586267442935, + "grad_norm": 8.041893005371094, + "learning_rate": 3.98685640227875e-06, + "loss": 1.781860113143921, + "refine_loss": 0.0, + "step": 578 + }, + { + "ce_loss": 0.43763017654418945, + "epoch": 0.21402827834765734, + "grad_norm": 8.58592414855957, + "learning_rate": 3.9710038967200825e-06, + "loss": 1.7505207061767578, + "refine_loss": 0.0, + "step": 579 + }, + { + "ce_loss": 0.459678053855896, + "epoch": 0.21439792995102117, + "grad_norm": 8.408544540405273, + "learning_rate": 3.955162184843625e-06, + "loss": 1.838712215423584, + "refine_loss": 0.0, + "step": 580 + }, + { + "ce_loss": 0.5614733695983887, + "epoch": 0.214767581554385, + "grad_norm": 9.449712753295898, + "learning_rate": 3.93933143282144e-06, + "loss": 2.2458934783935547, + "refine_loss": 0.0, + "step": 581 + }, + { + "ce_loss": 0.38466882705688477, + "epoch": 0.21513723315774883, + "grad_norm": 9.096491813659668, + "learning_rate": 3.9235118067106255e-06, + "loss": 1.538675308227539, + "refine_loss": 0.0, + "step": 582 + }, + { + "ce_loss": 0.35889458656311035, + "epoch": 0.21550688476111265, + "grad_norm": 8.741055488586426, + "learning_rate": 3.907703472451574e-06, + "loss": 1.4355783462524414, + "refine_loss": 0.0, + "step": 583 + }, + { + "ce_loss": 0.5473451614379883, + "epoch": 0.21587653636447648, + "grad_norm": 13.460650444030762, + "learning_rate": 3.89190659586623e-06, + "loss": 2.189380645751953, + "refine_loss": 0.0, + "step": 584 + }, + { + "ce_loss": 0.512986421585083, + "epoch": 0.21624618796784031, + "grad_norm": 8.84572696685791, + "learning_rate": 3.8761213426563546e-06, + "loss": 2.051945686340332, + "refine_loss": 0.0, + "step": 585 + }, + { + "ce_loss": 0.3742804527282715, + "epoch": 0.21661583957120414, + "grad_norm": 7.919396877288818, + "learning_rate": 3.8603478784017845e-06, + "loss": 1.497121810913086, + "refine_loss": 0.0, + "step": 586 + }, + { + "ce_loss": 0.5082879066467285, + "epoch": 0.21698549117456797, + "grad_norm": 10.463650703430176, + "learning_rate": 3.8445863685586946e-06, + "loss": 2.033151626586914, + "refine_loss": 0.0, + "step": 587 + }, + { + "ce_loss": 0.3962143659591675, + "epoch": 0.2173551427779318, + "grad_norm": 9.000871658325195, + "learning_rate": 3.828836978457868e-06, + "loss": 1.58485746383667, + "refine_loss": 0.0, + "step": 588 + }, + { + "ce_loss": 0.5124468803405762, + "epoch": 0.21772479438129563, + "grad_norm": 9.567322731018066, + "learning_rate": 3.8130998733029517e-06, + "loss": 2.0497875213623047, + "refine_loss": 0.0, + "step": 589 + }, + { + "ce_loss": 0.42940235137939453, + "epoch": 0.21809444598465946, + "grad_norm": 10.250985145568848, + "learning_rate": 3.7973752181687336e-06, + "loss": 1.7176094055175781, + "refine_loss": 0.0, + "step": 590 + }, + { + "ce_loss": 0.5333443880081177, + "epoch": 0.2184640975880233, + "grad_norm": 11.833946228027344, + "learning_rate": 3.7816631779994018e-06, + "loss": 2.1333775520324707, + "refine_loss": 0.0, + "step": 591 + }, + { + "ce_loss": 0.3664134740829468, + "epoch": 0.21883374919138712, + "grad_norm": 7.195476055145264, + "learning_rate": 3.7659639176068287e-06, + "loss": 1.465653896331787, + "refine_loss": 0.0, + "step": 592 + }, + { + "ce_loss": 0.40147876739501953, + "epoch": 0.21920340079475095, + "grad_norm": 7.480161666870117, + "learning_rate": 3.7502776016688234e-06, + "loss": 1.6059150695800781, + "refine_loss": 0.0, + "step": 593 + }, + { + "ce_loss": 0.3708341121673584, + "epoch": 0.21957305239811478, + "grad_norm": 8.001352310180664, + "learning_rate": 3.734604394727419e-06, + "loss": 1.4833364486694336, + "refine_loss": 0.0, + "step": 594 + }, + { + "ce_loss": 0.3476853370666504, + "epoch": 0.2199427040014786, + "grad_norm": 8.115730285644531, + "learning_rate": 3.7189444611871383e-06, + "loss": 1.3907413482666016, + "refine_loss": 0.0, + "step": 595 + }, + { + "ce_loss": 0.4777413010597229, + "epoch": 0.22031235560484244, + "grad_norm": 21.444746017456055, + "learning_rate": 3.703297965313275e-06, + "loss": 1.9109652042388916, + "refine_loss": 0.0, + "step": 596 + }, + { + "ce_loss": 0.5886745452880859, + "epoch": 0.22068200720820627, + "grad_norm": 8.601838111877441, + "learning_rate": 3.6876650712301654e-06, + "loss": 2.3546981811523438, + "refine_loss": 0.0, + "step": 597 + }, + { + "ce_loss": 0.3707760274410248, + "epoch": 0.2210516588115701, + "grad_norm": 8.23505973815918, + "learning_rate": 3.6720459429194743e-06, + "loss": 1.4831041097640991, + "refine_loss": 0.0, + "step": 598 + }, + { + "ce_loss": 0.45897746086120605, + "epoch": 0.22142131041493393, + "grad_norm": 8.57390308380127, + "learning_rate": 3.656440744218464e-06, + "loss": 1.8359098434448242, + "refine_loss": 0.0, + "step": 599 + }, + { + "ce_loss": 0.4887204170227051, + "epoch": 0.22179096201829776, + "grad_norm": 10.496850967407227, + "learning_rate": 3.6408496388182857e-06, + "loss": 1.9548816680908203, + "refine_loss": 0.0, + "step": 600 + }, + { + "ce_loss": 0.42523789405822754, + "epoch": 0.2221606136216616, + "grad_norm": 13.75538444519043, + "learning_rate": 3.6252727902622575e-06, + "loss": 1.7009515762329102, + "refine_loss": 0.0, + "step": 601 + }, + { + "ce_loss": 0.38059282302856445, + "epoch": 0.22253026522502542, + "grad_norm": 8.67073917388916, + "learning_rate": 3.6097103619441505e-06, + "loss": 1.5223712921142578, + "refine_loss": 0.0, + "step": 602 + }, + { + "ce_loss": 0.42409396171569824, + "epoch": 0.22289991682838925, + "grad_norm": 8.800985336303711, + "learning_rate": 3.594162517106472e-06, + "loss": 1.696375846862793, + "refine_loss": 0.0, + "step": 603 + }, + { + "ce_loss": 0.5710339546203613, + "epoch": 0.22326956843175308, + "grad_norm": 9.314031600952148, + "learning_rate": 3.578629418838757e-06, + "loss": 2.2841358184814453, + "refine_loss": 0.0, + "step": 604 + }, + { + "ce_loss": 0.4749301075935364, + "epoch": 0.2236392200351169, + "grad_norm": 7.420982360839844, + "learning_rate": 3.5631112300758595e-06, + "loss": 1.8997204303741455, + "refine_loss": 0.0, + "step": 605 + }, + { + "ce_loss": 0.6415605545043945, + "epoch": 0.22400887163848074, + "grad_norm": 10.44614028930664, + "learning_rate": 3.5476081135962335e-06, + "loss": 2.566242218017578, + "refine_loss": 0.0, + "step": 606 + }, + { + "ce_loss": 0.5835795402526855, + "epoch": 0.22437852324184457, + "grad_norm": 8.498126983642578, + "learning_rate": 3.532120232020236e-06, + "loss": 2.334318161010742, + "refine_loss": 0.0, + "step": 607 + }, + { + "ce_loss": 0.38862597942352295, + "epoch": 0.2247481748452084, + "grad_norm": 9.163359642028809, + "learning_rate": 3.516647747808417e-06, + "loss": 1.5545039176940918, + "refine_loss": 0.0, + "step": 608 + }, + { + "ce_loss": 0.3697335720062256, + "epoch": 0.22511782644857223, + "grad_norm": 8.744912147521973, + "learning_rate": 3.5011908232598124e-06, + "loss": 1.4789342880249023, + "refine_loss": 0.0, + "step": 609 + }, + { + "ce_loss": 0.5536946058273315, + "epoch": 0.22548747805193606, + "grad_norm": 11.425541877746582, + "learning_rate": 3.4857496205102475e-06, + "loss": 2.214778423309326, + "refine_loss": 0.0, + "step": 610 + }, + { + "ce_loss": 0.4524264335632324, + "epoch": 0.2258571296552999, + "grad_norm": 9.29116153717041, + "learning_rate": 3.4703243015306314e-06, + "loss": 1.8097057342529297, + "refine_loss": 0.0, + "step": 611 + }, + { + "ce_loss": 0.4114772081375122, + "epoch": 0.22622678125866372, + "grad_norm": 9.200993537902832, + "learning_rate": 3.4549150281252635e-06, + "loss": 1.6459088325500488, + "refine_loss": 0.0, + "step": 612 + }, + { + "ce_loss": 0.4839348793029785, + "epoch": 0.22659643286202755, + "grad_norm": 8.86767292022705, + "learning_rate": 3.4395219619301288e-06, + "loss": 1.935739517211914, + "refine_loss": 0.0, + "step": 613 + }, + { + "ce_loss": 0.6383647918701172, + "epoch": 0.22696608446539138, + "grad_norm": 9.192737579345703, + "learning_rate": 3.4241452644112085e-06, + "loss": 2.5534591674804688, + "refine_loss": 0.0, + "step": 614 + }, + { + "ce_loss": 0.5665136575698853, + "epoch": 0.2273357360687552, + "grad_norm": 9.466482162475586, + "learning_rate": 3.4087850968627823e-06, + "loss": 2.266054630279541, + "refine_loss": 0.0, + "step": 615 + }, + { + "ce_loss": 0.4255070686340332, + "epoch": 0.22770538767211904, + "grad_norm": 9.708521842956543, + "learning_rate": 3.3934416204057396e-06, + "loss": 1.7020282745361328, + "refine_loss": 0.0, + "step": 616 + }, + { + "ce_loss": 0.5681629180908203, + "epoch": 0.22807503927548287, + "grad_norm": 14.281062126159668, + "learning_rate": 3.3781149959858894e-06, + "loss": 2.2726516723632812, + "refine_loss": 0.0, + "step": 617 + }, + { + "ce_loss": 0.511265754699707, + "epoch": 0.2284446908788467, + "grad_norm": 13.803600311279297, + "learning_rate": 3.3628053843722674e-06, + "loss": 2.045063018798828, + "refine_loss": 0.0, + "step": 618 + }, + { + "ce_loss": 0.4879720211029053, + "epoch": 0.22881434248221053, + "grad_norm": 12.706273078918457, + "learning_rate": 3.3475129461554567e-06, + "loss": 1.951888084411621, + "refine_loss": 0.0, + "step": 619 + }, + { + "ce_loss": 0.5003206729888916, + "epoch": 0.22918399408557436, + "grad_norm": 11.200728416442871, + "learning_rate": 3.3322378417458985e-06, + "loss": 2.0012826919555664, + "refine_loss": 0.0, + "step": 620 + }, + { + "ce_loss": 0.5375955104827881, + "epoch": 0.22955364568893818, + "grad_norm": 11.125018119812012, + "learning_rate": 3.3169802313722076e-06, + "loss": 2.1503820419311523, + "refine_loss": 0.0, + "step": 621 + }, + { + "ce_loss": 0.4137754440307617, + "epoch": 0.22992329729230201, + "grad_norm": 10.014217376708984, + "learning_rate": 3.3017402750794976e-06, + "loss": 1.6551017761230469, + "refine_loss": 0.0, + "step": 622 + }, + { + "ce_loss": 0.40333986282348633, + "epoch": 0.23029294889566584, + "grad_norm": 10.53635311126709, + "learning_rate": 3.2865181327277007e-06, + "loss": 1.6133594512939453, + "refine_loss": 0.0, + "step": 623 + }, + { + "ce_loss": 0.5518575310707092, + "epoch": 0.23066260049902967, + "grad_norm": 13.531801223754883, + "learning_rate": 3.271313963989886e-06, + "loss": 2.207430124282837, + "refine_loss": 0.0, + "step": 624 + }, + { + "ce_loss": 0.44999027252197266, + "epoch": 0.2310322521023935, + "grad_norm": 8.873538970947266, + "learning_rate": 3.2561279283505888e-06, + "loss": 1.7999610900878906, + "refine_loss": 0.0, + "step": 625 + }, + { + "ce_loss": 0.4755483865737915, + "epoch": 0.23140190370575733, + "grad_norm": 14.836163520812988, + "learning_rate": 3.240960185104137e-06, + "loss": 1.902193546295166, + "refine_loss": 0.0, + "step": 626 + }, + { + "ce_loss": 0.541684627532959, + "epoch": 0.23177155530912116, + "grad_norm": 10.253257751464844, + "learning_rate": 3.2258108933529808e-06, + "loss": 2.166738510131836, + "refine_loss": 0.0, + "step": 627 + }, + { + "ce_loss": 0.41130292415618896, + "epoch": 0.232141206912485, + "grad_norm": 11.220494270324707, + "learning_rate": 3.2106802120060197e-06, + "loss": 1.6452116966247559, + "refine_loss": 0.0, + "step": 628 + }, + { + "ce_loss": 0.44029492139816284, + "epoch": 0.23251085851584882, + "grad_norm": 9.612217903137207, + "learning_rate": 3.195568299776945e-06, + "loss": 1.7611796855926514, + "refine_loss": 0.0, + "step": 629 + }, + { + "ce_loss": 0.5514917373657227, + "epoch": 0.23288051011921265, + "grad_norm": 9.910693168640137, + "learning_rate": 3.180475315182563e-06, + "loss": 2.2059669494628906, + "refine_loss": 0.0, + "step": 630 + }, + { + "ce_loss": 0.4506186246871948, + "epoch": 0.23325016172257648, + "grad_norm": 10.116756439208984, + "learning_rate": 3.16540141654114e-06, + "loss": 1.8024744987487793, + "refine_loss": 0.0, + "step": 631 + }, + { + "ce_loss": 0.3489036560058594, + "epoch": 0.2336198133259403, + "grad_norm": 8.901446342468262, + "learning_rate": 3.1503467619707407e-06, + "loss": 1.3956146240234375, + "refine_loss": 0.0, + "step": 632 + }, + { + "ce_loss": 0.7407140731811523, + "epoch": 0.23398946492930414, + "grad_norm": 13.239083290100098, + "learning_rate": 3.1353115093875676e-06, + "loss": 2.9628562927246094, + "refine_loss": 0.0, + "step": 633 + }, + { + "ce_loss": 0.37608802318573, + "epoch": 0.23435911653266797, + "grad_norm": 13.846875190734863, + "learning_rate": 3.1202958165043053e-06, + "loss": 1.50435209274292, + "refine_loss": 0.0, + "step": 634 + }, + { + "ce_loss": 0.366794228553772, + "epoch": 0.2347287681360318, + "grad_norm": 9.31505298614502, + "learning_rate": 3.1052998408284664e-06, + "loss": 1.467176914215088, + "refine_loss": 0.0, + "step": 635 + }, + { + "ce_loss": 0.48948192596435547, + "epoch": 0.23509841973939563, + "grad_norm": 9.515790939331055, + "learning_rate": 3.090323739660742e-06, + "loss": 1.9579277038574219, + "refine_loss": 0.0, + "step": 636 + }, + { + "ce_loss": 0.5225820541381836, + "epoch": 0.23546807134275946, + "grad_norm": 10.017742156982422, + "learning_rate": 3.0753676700933448e-06, + "loss": 2.0903282165527344, + "refine_loss": 0.0, + "step": 637 + }, + { + "ce_loss": 0.47458934783935547, + "epoch": 0.2358377229461233, + "grad_norm": 9.60313606262207, + "learning_rate": 3.060431789008368e-06, + "loss": 1.8983573913574219, + "refine_loss": 0.0, + "step": 638 + }, + { + "ce_loss": 0.4660773277282715, + "epoch": 0.23620737454948712, + "grad_norm": 13.231803894042969, + "learning_rate": 3.045516253076137e-06, + "loss": 1.864309310913086, + "refine_loss": 0.0, + "step": 639 + }, + { + "ce_loss": 0.502723217010498, + "epoch": 0.23657702615285095, + "grad_norm": 9.043842315673828, + "learning_rate": 3.0306212187535653e-06, + "loss": 2.010892868041992, + "refine_loss": 0.0, + "step": 640 + }, + { + "ce_loss": 0.5534672737121582, + "epoch": 0.23694667775621478, + "grad_norm": 10.099410057067871, + "learning_rate": 3.0157468422825148e-06, + "loss": 2.213869094848633, + "refine_loss": 0.0, + "step": 641 + }, + { + "ce_loss": 0.41518521308898926, + "epoch": 0.2373163293595786, + "grad_norm": 8.785932540893555, + "learning_rate": 3.000893279688155e-06, + "loss": 1.660740852355957, + "refine_loss": 0.0, + "step": 642 + }, + { + "ce_loss": 0.3543815612792969, + "epoch": 0.23768598096294244, + "grad_norm": 8.55467700958252, + "learning_rate": 2.9860606867773323e-06, + "loss": 1.4175262451171875, + "refine_loss": 0.0, + "step": 643 + }, + { + "ce_loss": 0.5601745843887329, + "epoch": 0.23805563256630627, + "grad_norm": 8.449277877807617, + "learning_rate": 2.9712492191369245e-06, + "loss": 2.2406983375549316, + "refine_loss": 0.0, + "step": 644 + }, + { + "ce_loss": 0.5257341861724854, + "epoch": 0.2384252841696701, + "grad_norm": 10.872962951660156, + "learning_rate": 2.9564590321322206e-06, + "loss": 2.1029367446899414, + "refine_loss": 0.0, + "step": 645 + }, + { + "ce_loss": 0.5637867450714111, + "epoch": 0.23879493577303393, + "grad_norm": 14.378861427307129, + "learning_rate": 2.9416902809052817e-06, + "loss": 2.2551469802856445, + "refine_loss": 0.0, + "step": 646 + }, + { + "ce_loss": 0.4209864139556885, + "epoch": 0.23916458737639776, + "grad_norm": 10.120001792907715, + "learning_rate": 2.9269431203733213e-06, + "loss": 1.683945655822754, + "refine_loss": 0.0, + "step": 647 + }, + { + "ce_loss": 0.5486574172973633, + "epoch": 0.2395342389797616, + "grad_norm": 11.293456077575684, + "learning_rate": 2.912217705227075e-06, + "loss": 2.194629669189453, + "refine_loss": 0.0, + "step": 648 + }, + { + "ce_loss": 0.4471569061279297, + "epoch": 0.23990389058312542, + "grad_norm": 11.103870391845703, + "learning_rate": 2.8975141899291777e-06, + "loss": 1.7886276245117188, + "refine_loss": 0.0, + "step": 649 + }, + { + "ce_loss": 0.49677175283432007, + "epoch": 0.24027354218648925, + "grad_norm": 15.573419570922852, + "learning_rate": 2.882832728712551e-06, + "loss": 1.9870870113372803, + "refine_loss": 0.0, + "step": 650 + }, + { + "ce_loss": 0.4476282596588135, + "epoch": 0.24064319378985308, + "grad_norm": 8.213716506958008, + "learning_rate": 2.868173475578772e-06, + "loss": 1.790513038635254, + "refine_loss": 0.0, + "step": 651 + }, + { + "ce_loss": 0.5065560340881348, + "epoch": 0.2410128453932169, + "grad_norm": 11.699137687683105, + "learning_rate": 2.8535365842964713e-06, + "loss": 2.026224136352539, + "refine_loss": 0.0, + "step": 652 + }, + { + "ce_loss": 0.4481391906738281, + "epoch": 0.2413824969965807, + "grad_norm": 14.39682388305664, + "learning_rate": 2.838922208399712e-06, + "loss": 1.7925567626953125, + "refine_loss": 0.0, + "step": 653 + }, + { + "ce_loss": 0.44654178619384766, + "epoch": 0.24175214859994454, + "grad_norm": 8.698363304138184, + "learning_rate": 2.8243305011863843e-06, + "loss": 1.7861671447753906, + "refine_loss": 0.0, + "step": 654 + }, + { + "ce_loss": 0.528839111328125, + "epoch": 0.24212180020330837, + "grad_norm": 10.794992446899414, + "learning_rate": 2.8097616157165886e-06, + "loss": 2.1153564453125, + "refine_loss": 0.0, + "step": 655 + }, + { + "ce_loss": 0.34007859230041504, + "epoch": 0.2424914518066722, + "grad_norm": 8.907689094543457, + "learning_rate": 2.7952157048110406e-06, + "loss": 1.3603143692016602, + "refine_loss": 0.0, + "step": 656 + }, + { + "ce_loss": 0.42537781596183777, + "epoch": 0.24286110341003603, + "grad_norm": 8.759946823120117, + "learning_rate": 2.780692921049465e-06, + "loss": 1.701511263847351, + "refine_loss": 0.0, + "step": 657 + }, + { + "ce_loss": 0.36801862716674805, + "epoch": 0.24323075501339986, + "grad_norm": 8.257847785949707, + "learning_rate": 2.7661934167689887e-06, + "loss": 1.4720745086669922, + "refine_loss": 0.0, + "step": 658 + }, + { + "ce_loss": 0.5431897640228271, + "epoch": 0.2436004066167637, + "grad_norm": 10.892056465148926, + "learning_rate": 2.751717344062552e-06, + "loss": 2.1727590560913086, + "refine_loss": 0.0, + "step": 659 + }, + { + "ce_loss": 0.3836963176727295, + "epoch": 0.24397005822012752, + "grad_norm": 11.024449348449707, + "learning_rate": 2.7372648547773063e-06, + "loss": 1.534785270690918, + "refine_loss": 0.0, + "step": 660 + }, + { + "ce_loss": 0.5555763244628906, + "epoch": 0.24433970982349135, + "grad_norm": 11.927938461303711, + "learning_rate": 2.722836100513027e-06, + "loss": 2.2223052978515625, + "refine_loss": 0.0, + "step": 661 + }, + { + "ce_loss": 0.5236469507217407, + "epoch": 0.24470936142685518, + "grad_norm": 9.888964653015137, + "learning_rate": 2.7084312326205164e-06, + "loss": 2.094587802886963, + "refine_loss": 0.0, + "step": 662 + }, + { + "ce_loss": 0.5719296336174011, + "epoch": 0.245079013030219, + "grad_norm": 8.936216354370117, + "learning_rate": 2.6940504022000248e-06, + "loss": 2.2877185344696045, + "refine_loss": 0.0, + "step": 663 + }, + { + "ce_loss": 0.5035660266876221, + "epoch": 0.24544866463358284, + "grad_norm": 10.685040473937988, + "learning_rate": 2.6796937600996587e-06, + "loss": 2.0142641067504883, + "refine_loss": 0.0, + "step": 664 + }, + { + "ce_loss": 0.5893560647964478, + "epoch": 0.24581831623694667, + "grad_norm": 11.781888961791992, + "learning_rate": 2.665361456913797e-06, + "loss": 2.357424259185791, + "refine_loss": 0.0, + "step": 665 + }, + { + "ce_loss": 0.5677556991577148, + "epoch": 0.2461879678403105, + "grad_norm": 9.863702774047852, + "learning_rate": 2.6510536429815224e-06, + "loss": 2.2710227966308594, + "refine_loss": 0.0, + "step": 666 + }, + { + "ce_loss": 0.5139479637145996, + "epoch": 0.24655761944367433, + "grad_norm": 9.538778305053711, + "learning_rate": 2.6367704683850293e-06, + "loss": 2.0557918548583984, + "refine_loss": 0.0, + "step": 667 + }, + { + "ce_loss": 0.42751264572143555, + "epoch": 0.24692727104703815, + "grad_norm": 8.85438346862793, + "learning_rate": 2.622512082948063e-06, + "loss": 1.7100505828857422, + "refine_loss": 0.0, + "step": 668 + }, + { + "ce_loss": 0.5406866073608398, + "epoch": 0.24729692265040198, + "grad_norm": 8.949142456054688, + "learning_rate": 2.6082786362343377e-06, + "loss": 2.1627464294433594, + "refine_loss": 0.0, + "step": 669 + }, + { + "ce_loss": 0.3738546371459961, + "epoch": 0.24766657425376581, + "grad_norm": 8.29307746887207, + "learning_rate": 2.594070277545975e-06, + "loss": 1.4954185485839844, + "refine_loss": 0.0, + "step": 670 + }, + { + "ce_loss": 0.4725918173789978, + "epoch": 0.24803622585712964, + "grad_norm": 8.782156944274902, + "learning_rate": 2.5798871559219362e-06, + "loss": 1.8903672695159912, + "refine_loss": 0.0, + "step": 671 + }, + { + "ce_loss": 0.4437835216522217, + "epoch": 0.24840587746049347, + "grad_norm": 9.681473731994629, + "learning_rate": 2.5657294201364526e-06, + "loss": 1.7751340866088867, + "refine_loss": 0.0, + "step": 672 + }, + { + "ce_loss": 0.46013665199279785, + "epoch": 0.2487755290638573, + "grad_norm": 13.692179679870605, + "learning_rate": 2.551597218697476e-06, + "loss": 1.8405466079711914, + "refine_loss": 0.0, + "step": 673 + }, + { + "ce_loss": 0.43346643447875977, + "epoch": 0.24914518066722113, + "grad_norm": 8.348217964172363, + "learning_rate": 2.5374906998451094e-06, + "loss": 1.733865737915039, + "refine_loss": 0.0, + "step": 674 + }, + { + "ce_loss": 0.539243221282959, + "epoch": 0.24951483227058496, + "grad_norm": 9.195411682128906, + "learning_rate": 2.5234100115500643e-06, + "loss": 2.156972885131836, + "refine_loss": 0.0, + "step": 675 + }, + { + "ce_loss": 0.5982260704040527, + "epoch": 0.2498844838739488, + "grad_norm": 10.044342994689941, + "learning_rate": 2.5093553015120937e-06, + "loss": 2.392904281616211, + "refine_loss": 0.0, + "step": 676 + }, + { + "ce_loss": 0.4976656436920166, + "epoch": 0.2502541354773126, + "grad_norm": 11.532010078430176, + "learning_rate": 2.4953267171584573e-06, + "loss": 1.9906625747680664, + "refine_loss": 0.0, + "step": 677 + }, + { + "ce_loss": 0.537226676940918, + "epoch": 0.25062378708067645, + "grad_norm": 13.27495002746582, + "learning_rate": 2.4813244056423692e-06, + "loss": 2.148906707763672, + "refine_loss": 0.0, + "step": 678 + }, + { + "ce_loss": 0.4296457767486572, + "epoch": 0.2509934386840403, + "grad_norm": 11.49736499786377, + "learning_rate": 2.467348513841447e-06, + "loss": 1.718583106994629, + "refine_loss": 0.0, + "step": 679 + }, + { + "ce_loss": 0.4194897413253784, + "epoch": 0.2513630902874041, + "grad_norm": 10.011813163757324, + "learning_rate": 2.4533991883561868e-06, + "loss": 1.6779589653015137, + "refine_loss": 0.0, + "step": 680 + }, + { + "ce_loss": 0.4001030921936035, + "epoch": 0.25173274189076794, + "grad_norm": 8.247354507446289, + "learning_rate": 2.439476575508408e-06, + "loss": 1.600412368774414, + "refine_loss": 0.0, + "step": 681 + }, + { + "ce_loss": 0.4442024230957031, + "epoch": 0.25210239349413177, + "grad_norm": 8.443912506103516, + "learning_rate": 2.425580821339733e-06, + "loss": 1.7768096923828125, + "refine_loss": 0.0, + "step": 682 + }, + { + "ce_loss": 0.6042484045028687, + "epoch": 0.2524720450974956, + "grad_norm": 11.898344993591309, + "learning_rate": 2.4117120716100484e-06, + "loss": 2.4169936180114746, + "refine_loss": 0.0, + "step": 683 + }, + { + "ce_loss": 0.5521482229232788, + "epoch": 0.25284169670085943, + "grad_norm": 10.034185409545898, + "learning_rate": 2.3978704717959777e-06, + "loss": 2.2085928916931152, + "refine_loss": 0.0, + "step": 684 + }, + { + "ce_loss": 0.4379132390022278, + "epoch": 0.25321134830422326, + "grad_norm": 9.909683227539062, + "learning_rate": 2.38405616708935e-06, + "loss": 1.7516529560089111, + "refine_loss": 0.0, + "step": 685 + }, + { + "ce_loss": 0.3906847834587097, + "epoch": 0.2535809999075871, + "grad_norm": 7.81617546081543, + "learning_rate": 2.3702693023956853e-06, + "loss": 1.5627391338348389, + "refine_loss": 0.0, + "step": 686 + }, + { + "ce_loss": 0.6013399362564087, + "epoch": 0.2539506515109509, + "grad_norm": 9.647492408752441, + "learning_rate": 2.356510022332674e-06, + "loss": 2.4053597450256348, + "refine_loss": 0.0, + "step": 687 + }, + { + "ce_loss": 0.5429067611694336, + "epoch": 0.25432030311431475, + "grad_norm": 8.889986038208008, + "learning_rate": 2.342778471228648e-06, + "loss": 2.1716270446777344, + "refine_loss": 0.0, + "step": 688 + }, + { + "ce_loss": 0.5254337787628174, + "epoch": 0.2546899547176786, + "grad_norm": 9.132460594177246, + "learning_rate": 2.329074793121085e-06, + "loss": 2.1017351150512695, + "refine_loss": 0.0, + "step": 689 + }, + { + "ce_loss": 0.4354567527770996, + "epoch": 0.2550596063210424, + "grad_norm": 10.492297172546387, + "learning_rate": 2.315399131755081e-06, + "loss": 1.7418270111083984, + "refine_loss": 0.0, + "step": 690 + }, + { + "ce_loss": 0.5492331981658936, + "epoch": 0.25542925792440624, + "grad_norm": 8.801657676696777, + "learning_rate": 2.301751630581855e-06, + "loss": 2.196932792663574, + "refine_loss": 0.0, + "step": 691 + }, + { + "ce_loss": 0.39961135387420654, + "epoch": 0.25579890952777007, + "grad_norm": 8.714338302612305, + "learning_rate": 2.2881324327572336e-06, + "loss": 1.5984454154968262, + "refine_loss": 0.0, + "step": 692 + }, + { + "ce_loss": 0.5071737766265869, + "epoch": 0.2561685611311339, + "grad_norm": 18.82767677307129, + "learning_rate": 2.274541681140159e-06, + "loss": 2.0286951065063477, + "refine_loss": 0.0, + "step": 693 + }, + { + "ce_loss": 0.4550451338291168, + "epoch": 0.25653821273449773, + "grad_norm": 11.095293998718262, + "learning_rate": 2.260979518291186e-06, + "loss": 1.8201805353164673, + "refine_loss": 0.0, + "step": 694 + }, + { + "ce_loss": 0.5511274337768555, + "epoch": 0.25690786433786156, + "grad_norm": 10.300556182861328, + "learning_rate": 2.2474460864709825e-06, + "loss": 2.204509735107422, + "refine_loss": 0.0, + "step": 695 + }, + { + "ce_loss": 0.5361535549163818, + "epoch": 0.2572775159412254, + "grad_norm": 11.324986457824707, + "learning_rate": 2.233941527638848e-06, + "loss": 2.1446142196655273, + "refine_loss": 0.0, + "step": 696 + }, + { + "ce_loss": 0.5879361629486084, + "epoch": 0.2576471675445892, + "grad_norm": 13.605055809020996, + "learning_rate": 2.2204659834512095e-06, + "loss": 2.3517446517944336, + "refine_loss": 0.0, + "step": 697 + }, + { + "ce_loss": 0.30018556118011475, + "epoch": 0.25801681914795305, + "grad_norm": 9.901082038879395, + "learning_rate": 2.207019595260154e-06, + "loss": 1.200742244720459, + "refine_loss": 0.0, + "step": 698 + }, + { + "ce_loss": 0.43702661991119385, + "epoch": 0.2583864707513169, + "grad_norm": 12.480359077453613, + "learning_rate": 2.1936025041119268e-06, + "loss": 1.7481064796447754, + "refine_loss": 0.0, + "step": 699 + }, + { + "ce_loss": 0.561983585357666, + "epoch": 0.2587561223546807, + "grad_norm": 11.725361824035645, + "learning_rate": 2.1802148507454675e-06, + "loss": 2.247934341430664, + "refine_loss": 0.0, + "step": 700 + }, + { + "ce_loss": 0.47890472412109375, + "epoch": 0.25912577395804454, + "grad_norm": 11.109660148620605, + "learning_rate": 2.1668567755909257e-06, + "loss": 1.915618896484375, + "refine_loss": 0.0, + "step": 701 + }, + { + "ce_loss": 0.423382043838501, + "epoch": 0.25949542556140837, + "grad_norm": 10.525919914245605, + "learning_rate": 2.1535284187681866e-06, + "loss": 1.693528175354004, + "refine_loss": 0.0, + "step": 702 + }, + { + "ce_loss": 0.5219349265098572, + "epoch": 0.2598650771647722, + "grad_norm": 11.063026428222656, + "learning_rate": 2.140229920085409e-06, + "loss": 2.0877397060394287, + "refine_loss": 0.0, + "step": 703 + }, + { + "ce_loss": 0.3990159034729004, + "epoch": 0.260234728768136, + "grad_norm": 8.916237831115723, + "learning_rate": 2.1269614190375477e-06, + "loss": 1.5960636138916016, + "refine_loss": 0.0, + "step": 704 + }, + { + "ce_loss": 0.5033814907073975, + "epoch": 0.26060438037149986, + "grad_norm": 10.462356567382812, + "learning_rate": 2.1137230548049042e-06, + "loss": 2.01352596282959, + "refine_loss": 0.0, + "step": 705 + }, + { + "ce_loss": 0.4525042772293091, + "epoch": 0.2609740319748637, + "grad_norm": 9.597060203552246, + "learning_rate": 2.1005149662516517e-06, + "loss": 1.8100171089172363, + "refine_loss": 0.0, + "step": 706 + }, + { + "ce_loss": 0.4075286388397217, + "epoch": 0.2613436835782275, + "grad_norm": 11.161819458007812, + "learning_rate": 2.08733729192439e-06, + "loss": 1.6301145553588867, + "refine_loss": 0.0, + "step": 707 + }, + { + "ce_loss": 0.3205733299255371, + "epoch": 0.26171333518159134, + "grad_norm": 11.722637176513672, + "learning_rate": 2.07419017005069e-06, + "loss": 1.2822933197021484, + "refine_loss": 0.0, + "step": 708 + }, + { + "ce_loss": 0.5204401016235352, + "epoch": 0.2620829867849552, + "grad_norm": 12.676664352416992, + "learning_rate": 2.061073738537635e-06, + "loss": 2.0817604064941406, + "refine_loss": 0.0, + "step": 709 + }, + { + "ce_loss": 0.3936443328857422, + "epoch": 0.262452638388319, + "grad_norm": 8.586315155029297, + "learning_rate": 2.0479881349703885e-06, + "loss": 1.5745773315429688, + "refine_loss": 0.0, + "step": 710 + }, + { + "ce_loss": 0.4890165328979492, + "epoch": 0.26282228999168283, + "grad_norm": 11.140220642089844, + "learning_rate": 2.0349334966107363e-06, + "loss": 1.9560661315917969, + "refine_loss": 0.0, + "step": 711 + }, + { + "ce_loss": 0.5402286052703857, + "epoch": 0.26319194159504666, + "grad_norm": 10.139958381652832, + "learning_rate": 2.021909960395661e-06, + "loss": 2.160914421081543, + "refine_loss": 0.0, + "step": 712 + }, + { + "ce_loss": 0.42759203910827637, + "epoch": 0.2635615931984105, + "grad_norm": 9.4074068069458, + "learning_rate": 2.0089176629358904e-06, + "loss": 1.7103681564331055, + "refine_loss": 0.0, + "step": 713 + }, + { + "ce_loss": 0.45156294107437134, + "epoch": 0.2639312448017743, + "grad_norm": 11.246671676635742, + "learning_rate": 1.9959567405144825e-06, + "loss": 1.8062517642974854, + "refine_loss": 0.0, + "step": 714 + }, + { + "ce_loss": 0.5093896985054016, + "epoch": 0.26430089640513815, + "grad_norm": 10.922212600708008, + "learning_rate": 1.983027329085377e-06, + "loss": 2.0375587940216064, + "refine_loss": 0.0, + "step": 715 + }, + { + "ce_loss": 0.43191951513290405, + "epoch": 0.264670548008502, + "grad_norm": 10.91669750213623, + "learning_rate": 1.9701295642719836e-06, + "loss": 1.7276780605316162, + "refine_loss": 0.0, + "step": 716 + }, + { + "ce_loss": 0.5398119688034058, + "epoch": 0.2650401996118658, + "grad_norm": 11.079964637756348, + "learning_rate": 1.957263581365749e-06, + "loss": 2.159247875213623, + "refine_loss": 0.0, + "step": 717 + }, + { + "ce_loss": 0.5145858526229858, + "epoch": 0.26540985121522964, + "grad_norm": 12.654104232788086, + "learning_rate": 1.944429515324749e-06, + "loss": 2.0583434104919434, + "refine_loss": 0.0, + "step": 718 + }, + { + "ce_loss": 0.4086298942565918, + "epoch": 0.26577950281859347, + "grad_norm": 9.401398658752441, + "learning_rate": 1.931627500772263e-06, + "loss": 1.6345195770263672, + "refine_loss": 0.0, + "step": 719 + }, + { + "ce_loss": 0.44599854946136475, + "epoch": 0.2661491544219573, + "grad_norm": 10.598443984985352, + "learning_rate": 1.9188576719953635e-06, + "loss": 1.783994197845459, + "refine_loss": 0.0, + "step": 720 + }, + { + "ce_loss": 0.5341277122497559, + "epoch": 0.26651880602532113, + "grad_norm": 16.239736557006836, + "learning_rate": 1.906120162943515e-06, + "loss": 2.1365108489990234, + "refine_loss": 0.0, + "step": 721 + }, + { + "ce_loss": 0.4684973955154419, + "epoch": 0.26688845762868496, + "grad_norm": 10.688444137573242, + "learning_rate": 1.8934151072271573e-06, + "loss": 1.8739895820617676, + "refine_loss": 0.0, + "step": 722 + }, + { + "ce_loss": 0.3530154228210449, + "epoch": 0.2672581092320488, + "grad_norm": 11.699592590332031, + "learning_rate": 1.8807426381163151e-06, + "loss": 1.4120616912841797, + "refine_loss": 0.0, + "step": 723 + }, + { + "ce_loss": 0.42848658561706543, + "epoch": 0.2676277608354126, + "grad_norm": 10.576115608215332, + "learning_rate": 1.8681028885391905e-06, + "loss": 1.7139463424682617, + "refine_loss": 0.0, + "step": 724 + }, + { + "ce_loss": 0.5989351272583008, + "epoch": 0.26799741243877645, + "grad_norm": 11.30154800415039, + "learning_rate": 1.8554959910807773e-06, + "loss": 2.395740509033203, + "refine_loss": 0.0, + "step": 725 + }, + { + "ce_loss": 0.5336481332778931, + "epoch": 0.2683670640421403, + "grad_norm": 15.14905071258545, + "learning_rate": 1.8429220779814654e-06, + "loss": 2.1345925331115723, + "refine_loss": 0.0, + "step": 726 + }, + { + "ce_loss": 0.6338975429534912, + "epoch": 0.2687367156455041, + "grad_norm": 10.706128120422363, + "learning_rate": 1.8303812811356503e-06, + "loss": 2.535590171813965, + "refine_loss": 0.0, + "step": 727 + }, + { + "ce_loss": 0.3598291873931885, + "epoch": 0.26910636724886794, + "grad_norm": 9.815231323242188, + "learning_rate": 1.81787373209036e-06, + "loss": 1.439316749572754, + "refine_loss": 0.0, + "step": 728 + }, + { + "ce_loss": 0.5984668731689453, + "epoch": 0.26947601885223177, + "grad_norm": 13.148323059082031, + "learning_rate": 1.8053995620438625e-06, + "loss": 2.3938674926757812, + "refine_loss": 0.0, + "step": 729 + }, + { + "ce_loss": 0.5021162033081055, + "epoch": 0.2698456704555956, + "grad_norm": 11.848552703857422, + "learning_rate": 1.7929589018443016e-06, + "loss": 2.008464813232422, + "refine_loss": 0.0, + "step": 730 + }, + { + "ce_loss": 0.6029839515686035, + "epoch": 0.27021532205895943, + "grad_norm": 10.663994789123535, + "learning_rate": 1.7805518819883134e-06, + "loss": 2.411935806274414, + "refine_loss": 0.0, + "step": 731 + }, + { + "ce_loss": 0.5689539909362793, + "epoch": 0.27058497366232326, + "grad_norm": 10.419864654541016, + "learning_rate": 1.7681786326196665e-06, + "loss": 2.275815963745117, + "refine_loss": 0.0, + "step": 732 + }, + { + "ce_loss": 0.5448682308197021, + "epoch": 0.2709546252656871, + "grad_norm": 11.738164901733398, + "learning_rate": 1.755839283527893e-06, + "loss": 2.1794729232788086, + "refine_loss": 0.0, + "step": 733 + }, + { + "ce_loss": 0.4663722515106201, + "epoch": 0.2713242768690509, + "grad_norm": 13.051131248474121, + "learning_rate": 1.743533964146924e-06, + "loss": 1.8654890060424805, + "refine_loss": 0.0, + "step": 734 + }, + { + "ce_loss": 0.5959153175354004, + "epoch": 0.27169392847241475, + "grad_norm": 9.978246688842773, + "learning_rate": 1.7312628035537388e-06, + "loss": 2.3836612701416016, + "refine_loss": 0.0, + "step": 735 + }, + { + "ce_loss": 0.3628206253051758, + "epoch": 0.2720635800757786, + "grad_norm": 17.45003318786621, + "learning_rate": 1.7190259304670038e-06, + "loss": 1.4512825012207031, + "refine_loss": 0.0, + "step": 736 + }, + { + "ce_loss": 0.5268288850784302, + "epoch": 0.2724332316791424, + "grad_norm": 10.603691101074219, + "learning_rate": 1.706823473245729e-06, + "loss": 2.1073155403137207, + "refine_loss": 0.0, + "step": 737 + }, + { + "ce_loss": 0.5412100553512573, + "epoch": 0.27280288328250624, + "grad_norm": 11.311071395874023, + "learning_rate": 1.6946555598879138e-06, + "loss": 2.1648402214050293, + "refine_loss": 0.0, + "step": 738 + }, + { + "ce_loss": 0.494016170501709, + "epoch": 0.27317253488587007, + "grad_norm": 10.382637023925781, + "learning_rate": 1.6825223180292138e-06, + "loss": 1.976064682006836, + "refine_loss": 0.0, + "step": 739 + }, + { + "ce_loss": 0.5019626617431641, + "epoch": 0.2735421864892339, + "grad_norm": 12.261758804321289, + "learning_rate": 1.6704238749415958e-06, + "loss": 2.0078506469726562, + "refine_loss": 0.0, + "step": 740 + }, + { + "ce_loss": 0.6648910045623779, + "epoch": 0.2739118380925977, + "grad_norm": 16.344078063964844, + "learning_rate": 1.6583603575320002e-06, + "loss": 2.6595640182495117, + "refine_loss": 0.0, + "step": 741 + }, + { + "ce_loss": 0.3459756374359131, + "epoch": 0.27428148969596156, + "grad_norm": 10.523884773254395, + "learning_rate": 1.6463318923410183e-06, + "loss": 1.3839025497436523, + "refine_loss": 0.0, + "step": 742 + }, + { + "ce_loss": 0.5223546028137207, + "epoch": 0.2746511412993254, + "grad_norm": 11.893763542175293, + "learning_rate": 1.6343386055415545e-06, + "loss": 2.089418411254883, + "refine_loss": 0.0, + "step": 743 + }, + { + "ce_loss": 0.45090436935424805, + "epoch": 0.2750207929026892, + "grad_norm": 10.212662696838379, + "learning_rate": 1.6223806229375182e-06, + "loss": 1.8036174774169922, + "refine_loss": 0.0, + "step": 744 + }, + { + "ce_loss": 0.5495195388793945, + "epoch": 0.27539044450605304, + "grad_norm": 15.449052810668945, + "learning_rate": 1.6104580699624839e-06, + "loss": 2.198078155517578, + "refine_loss": 0.0, + "step": 745 + }, + { + "ce_loss": 0.5001084804534912, + "epoch": 0.2757600961094169, + "grad_norm": 10.78960132598877, + "learning_rate": 1.5985710716783936e-06, + "loss": 2.000433921813965, + "refine_loss": 0.0, + "step": 746 + }, + { + "ce_loss": 0.5877435207366943, + "epoch": 0.2761297477127807, + "grad_norm": 21.055932998657227, + "learning_rate": 1.5867197527742312e-06, + "loss": 2.3509740829467773, + "refine_loss": 0.0, + "step": 747 + }, + { + "ce_loss": 0.41708993911743164, + "epoch": 0.27649939931614453, + "grad_norm": 11.033345222473145, + "learning_rate": 1.5749042375647261e-06, + "loss": 1.6683597564697266, + "refine_loss": 0.0, + "step": 748 + }, + { + "ce_loss": 0.4394409656524658, + "epoch": 0.27686905091950836, + "grad_norm": 12.334632873535156, + "learning_rate": 1.563124649989043e-06, + "loss": 1.7577638626098633, + "refine_loss": 0.0, + "step": 749 + }, + { + "ce_loss": 0.5330979824066162, + "epoch": 0.2772387025228722, + "grad_norm": 9.937726020812988, + "learning_rate": 1.5513811136094786e-06, + "loss": 2.132391929626465, + "refine_loss": 0.0, + "step": 750 + }, + { + "ce_loss": 0.6190929412841797, + "epoch": 0.277608354126236, + "grad_norm": 11.392047882080078, + "learning_rate": 1.5396737516101757e-06, + "loss": 2.4763717651367188, + "refine_loss": 0.0, + "step": 751 + }, + { + "ce_loss": 0.45821094512939453, + "epoch": 0.27797800572959985, + "grad_norm": 11.36151123046875, + "learning_rate": 1.5280026867958186e-06, + "loss": 1.8328437805175781, + "refine_loss": 0.0, + "step": 752 + }, + { + "ce_loss": 0.5146242380142212, + "epoch": 0.2783476573329637, + "grad_norm": 9.97701358795166, + "learning_rate": 1.516368041590358e-06, + "loss": 2.0584969520568848, + "refine_loss": 0.0, + "step": 753 + }, + { + "ce_loss": 0.4514446258544922, + "epoch": 0.2787173089363275, + "grad_norm": 10.359207153320312, + "learning_rate": 1.5047699380357134e-06, + "loss": 1.8057785034179688, + "refine_loss": 0.0, + "step": 754 + }, + { + "ce_loss": 0.46965813636779785, + "epoch": 0.27908696053969134, + "grad_norm": 10.557652473449707, + "learning_rate": 1.4932084977905043e-06, + "loss": 1.8786325454711914, + "refine_loss": 0.0, + "step": 755 + }, + { + "ce_loss": 0.5621048212051392, + "epoch": 0.27945661214305517, + "grad_norm": 8.77821159362793, + "learning_rate": 1.4816838421287693e-06, + "loss": 2.2484192848205566, + "refine_loss": 0.0, + "step": 756 + }, + { + "ce_loss": 0.5371860861778259, + "epoch": 0.279826263746419, + "grad_norm": 13.360777854919434, + "learning_rate": 1.470196091938691e-06, + "loss": 2.1487443447113037, + "refine_loss": 0.0, + "step": 757 + }, + { + "ce_loss": 0.5159921646118164, + "epoch": 0.28019591534978283, + "grad_norm": 10.614058494567871, + "learning_rate": 1.4587453677213348e-06, + "loss": 2.0639686584472656, + "refine_loss": 0.0, + "step": 758 + }, + { + "ce_loss": 0.5738165378570557, + "epoch": 0.28056556695314666, + "grad_norm": 13.710598945617676, + "learning_rate": 1.4473317895893773e-06, + "loss": 2.2952661514282227, + "refine_loss": 0.0, + "step": 759 + }, + { + "ce_loss": 0.5323021411895752, + "epoch": 0.2809352185565105, + "grad_norm": 9.153010368347168, + "learning_rate": 1.4359554772658551e-06, + "loss": 2.129208564758301, + "refine_loss": 0.0, + "step": 760 + }, + { + "ce_loss": 0.5512847900390625, + "epoch": 0.2813048701598743, + "grad_norm": 10.505056381225586, + "learning_rate": 1.4246165500828974e-06, + "loss": 2.20513916015625, + "refine_loss": 0.0, + "step": 761 + }, + { + "ce_loss": 0.6141297817230225, + "epoch": 0.28167452176323815, + "grad_norm": 11.103848457336426, + "learning_rate": 1.4133151269804873e-06, + "loss": 2.45651912689209, + "refine_loss": 0.0, + "step": 762 + }, + { + "ce_loss": 0.504880428314209, + "epoch": 0.282044173366602, + "grad_norm": 11.943163871765137, + "learning_rate": 1.4020513265052072e-06, + "loss": 2.019521713256836, + "refine_loss": 0.0, + "step": 763 + }, + { + "ce_loss": 0.607855498790741, + "epoch": 0.2824138249699658, + "grad_norm": 14.647708892822266, + "learning_rate": 1.39082526680899e-06, + "loss": 2.431421995162964, + "refine_loss": 0.0, + "step": 764 + }, + { + "ce_loss": 0.6223640441894531, + "epoch": 0.28278347657332964, + "grad_norm": 14.308052062988281, + "learning_rate": 1.3796370656478936e-06, + "loss": 2.4894561767578125, + "refine_loss": 0.0, + "step": 765 + }, + { + "ce_loss": 0.43649744987487793, + "epoch": 0.28315312817669347, + "grad_norm": 12.026082992553711, + "learning_rate": 1.368486840380851e-06, + "loss": 1.7459897994995117, + "refine_loss": 0.0, + "step": 766 + }, + { + "ce_loss": 0.44762372970581055, + "epoch": 0.2835227797800573, + "grad_norm": 10.865455627441406, + "learning_rate": 1.357374707968452e-06, + "loss": 1.7904949188232422, + "refine_loss": 0.0, + "step": 767 + }, + { + "ce_loss": 0.5169000625610352, + "epoch": 0.28389243138342113, + "grad_norm": 11.241767883300781, + "learning_rate": 1.3463007849717035e-06, + "loss": 2.0676002502441406, + "refine_loss": 0.0, + "step": 768 + }, + { + "ce_loss": 0.6327986717224121, + "epoch": 0.28426208298678496, + "grad_norm": 11.504475593566895, + "learning_rate": 1.3352651875508204e-06, + "loss": 2.5311946868896484, + "refine_loss": 0.0, + "step": 769 + }, + { + "ce_loss": 0.4663664400577545, + "epoch": 0.2846317345901488, + "grad_norm": 13.32528305053711, + "learning_rate": 1.3242680314639995e-06, + "loss": 1.865465760231018, + "refine_loss": 0.0, + "step": 770 + }, + { + "ce_loss": 0.5479562282562256, + "epoch": 0.2850013861935126, + "grad_norm": 12.748851776123047, + "learning_rate": 1.3133094320662e-06, + "loss": 2.1918249130249023, + "refine_loss": 0.0, + "step": 771 + }, + { + "ce_loss": 0.4509480893611908, + "epoch": 0.28537103779687645, + "grad_norm": 11.680636405944824, + "learning_rate": 1.3023895043079476e-06, + "loss": 1.8037923574447632, + "refine_loss": 0.0, + "step": 772 + }, + { + "ce_loss": 0.5514822006225586, + "epoch": 0.2857406894002403, + "grad_norm": 12.159968376159668, + "learning_rate": 1.291508362734113e-06, + "loss": 2.2059288024902344, + "refine_loss": 0.0, + "step": 773 + }, + { + "ce_loss": 0.5850896835327148, + "epoch": 0.2861103410036041, + "grad_norm": 14.672621726989746, + "learning_rate": 1.2806661214827286e-06, + "loss": 2.3403587341308594, + "refine_loss": 0.0, + "step": 774 + }, + { + "ce_loss": 0.4676554203033447, + "epoch": 0.28647999260696794, + "grad_norm": 12.670482635498047, + "learning_rate": 1.2698628942837698e-06, + "loss": 1.870621681213379, + "refine_loss": 0.0, + "step": 775 + }, + { + "ce_loss": 0.4849480390548706, + "epoch": 0.28684964421033177, + "grad_norm": 20.24614906311035, + "learning_rate": 1.2590987944579808e-06, + "loss": 1.9397921562194824, + "refine_loss": 0.0, + "step": 776 + }, + { + "ce_loss": 0.5108424425125122, + "epoch": 0.2872192958136956, + "grad_norm": 10.214526176452637, + "learning_rate": 1.2483739349156726e-06, + "loss": 2.043369770050049, + "refine_loss": 0.0, + "step": 777 + }, + { + "ce_loss": 0.4373484253883362, + "epoch": 0.2875889474170594, + "grad_norm": 14.966462135314941, + "learning_rate": 1.2376884281555485e-06, + "loss": 1.7493937015533447, + "refine_loss": 0.0, + "step": 778 + }, + { + "ce_loss": 0.5580272674560547, + "epoch": 0.28795859902042326, + "grad_norm": 12.958050727844238, + "learning_rate": 1.2270423862635188e-06, + "loss": 2.2321090698242188, + "refine_loss": 0.0, + "step": 779 + }, + { + "ce_loss": 0.5203151702880859, + "epoch": 0.2883282506237871, + "grad_norm": 17.826290130615234, + "learning_rate": 1.2164359209115235e-06, + "loss": 2.0812606811523438, + "refine_loss": 0.0, + "step": 780 + }, + { + "ce_loss": 0.6261954307556152, + "epoch": 0.2886979022271509, + "grad_norm": 15.008233070373535, + "learning_rate": 1.2058691433563675e-06, + "loss": 2.504781723022461, + "refine_loss": 0.0, + "step": 781 + }, + { + "ce_loss": 0.5368080139160156, + "epoch": 0.28906755383051475, + "grad_norm": 12.931550025939941, + "learning_rate": 1.1953421644385444e-06, + "loss": 2.1472320556640625, + "refine_loss": 0.0, + "step": 782 + }, + { + "ce_loss": 0.5071260929107666, + "epoch": 0.2894372054338786, + "grad_norm": 11.08073902130127, + "learning_rate": 1.184855094581085e-06, + "loss": 2.0285043716430664, + "refine_loss": 0.0, + "step": 783 + }, + { + "ce_loss": 0.4824800491333008, + "epoch": 0.2898068570372424, + "grad_norm": 13.488687515258789, + "learning_rate": 1.1744080437883859e-06, + "loss": 1.9299201965332031, + "refine_loss": 0.0, + "step": 784 + }, + { + "ce_loss": 0.5582003593444824, + "epoch": 0.29017650864060623, + "grad_norm": 11.446063995361328, + "learning_rate": 1.164001121645069e-06, + "loss": 2.2328014373779297, + "refine_loss": 0.0, + "step": 785 + }, + { + "ce_loss": 0.40076613426208496, + "epoch": 0.29054616024397006, + "grad_norm": 14.727218627929688, + "learning_rate": 1.1536344373148245e-06, + "loss": 1.6030645370483398, + "refine_loss": 0.0, + "step": 786 + }, + { + "ce_loss": 0.6473727226257324, + "epoch": 0.2909158118473339, + "grad_norm": 11.634893417358398, + "learning_rate": 1.1433080995392614e-06, + "loss": 2.5894908905029297, + "refine_loss": 0.0, + "step": 787 + }, + { + "ce_loss": 0.5038809776306152, + "epoch": 0.2912854634506977, + "grad_norm": 11.105321884155273, + "learning_rate": 1.133022216636781e-06, + "loss": 2.015523910522461, + "refine_loss": 0.0, + "step": 788 + }, + { + "ce_loss": 0.5091217756271362, + "epoch": 0.29165511505406155, + "grad_norm": 11.140454292297363, + "learning_rate": 1.1227768965014246e-06, + "loss": 2.036487102508545, + "refine_loss": 0.0, + "step": 789 + }, + { + "ce_loss": 0.43505859375, + "epoch": 0.2920247666574254, + "grad_norm": 13.135550498962402, + "learning_rate": 1.1125722466017547e-06, + "loss": 1.740234375, + "refine_loss": 0.0, + "step": 790 + }, + { + "ce_loss": 0.5850045680999756, + "epoch": 0.2923944182607892, + "grad_norm": 10.865703582763672, + "learning_rate": 1.102408373979717e-06, + "loss": 2.3400182723999023, + "refine_loss": 0.0, + "step": 791 + }, + { + "ce_loss": 0.46677350997924805, + "epoch": 0.29276406986415304, + "grad_norm": 11.012785911560059, + "learning_rate": 1.092285385249528e-06, + "loss": 1.8670940399169922, + "refine_loss": 0.0, + "step": 792 + }, + { + "ce_loss": 0.690381646156311, + "epoch": 0.2931337214675169, + "grad_norm": 18.860504150390625, + "learning_rate": 1.0822033865965503e-06, + "loss": 2.761526584625244, + "refine_loss": 0.0, + "step": 793 + }, + { + "ce_loss": 0.5481233596801758, + "epoch": 0.2935033730708807, + "grad_norm": 16.09911346435547, + "learning_rate": 1.0721624837761768e-06, + "loss": 2.192493438720703, + "refine_loss": 0.0, + "step": 794 + }, + { + "ce_loss": 0.7393922805786133, + "epoch": 0.29387302467424453, + "grad_norm": 15.772576332092285, + "learning_rate": 1.062162782112729e-06, + "loss": 2.957569122314453, + "refine_loss": 0.0, + "step": 795 + }, + { + "ce_loss": 0.5205652713775635, + "epoch": 0.29424267627760836, + "grad_norm": 20.624980926513672, + "learning_rate": 1.0522043864983428e-06, + "loss": 2.082261085510254, + "refine_loss": 0.0, + "step": 796 + }, + { + "ce_loss": 0.5293693542480469, + "epoch": 0.2946123278809722, + "grad_norm": 11.043195724487305, + "learning_rate": 1.0422874013918793e-06, + "loss": 2.1174774169921875, + "refine_loss": 0.0, + "step": 797 + }, + { + "ce_loss": 0.40993690490722656, + "epoch": 0.294981979484336, + "grad_norm": 9.588510513305664, + "learning_rate": 1.0324119308178166e-06, + "loss": 1.6397476196289062, + "refine_loss": 0.0, + "step": 798 + }, + { + "ce_loss": 0.62449049949646, + "epoch": 0.29535163108769985, + "grad_norm": 10.796364784240723, + "learning_rate": 1.0225780783651689e-06, + "loss": 2.49796199798584, + "refine_loss": 0.0, + "step": 799 + }, + { + "ce_loss": 0.5220094323158264, + "epoch": 0.2957212826910637, + "grad_norm": 10.725756645202637, + "learning_rate": 1.012785947186397e-06, + "loss": 2.0880377292633057, + "refine_loss": 0.0, + "step": 800 + }, + { + "ce_loss": 0.3758988380432129, + "epoch": 0.2960909342944275, + "grad_norm": 9.616456985473633, + "learning_rate": 1.0030356399963204e-06, + "loss": 1.5035953521728516, + "refine_loss": 0.0, + "step": 801 + }, + { + "ce_loss": 0.6117897033691406, + "epoch": 0.29646058589779134, + "grad_norm": 11.851527214050293, + "learning_rate": 9.933272590710508e-07, + "loss": 2.4471588134765625, + "refine_loss": 0.0, + "step": 802 + }, + { + "ce_loss": 0.6554871201515198, + "epoch": 0.29683023750115517, + "grad_norm": 12.550518989562988, + "learning_rate": 9.836609062469066e-07, + "loss": 2.621948480606079, + "refine_loss": 0.0, + "step": 803 + }, + { + "ce_loss": 0.6446542739868164, + "epoch": 0.297199889104519, + "grad_norm": 12.259547233581543, + "learning_rate": 9.740366829193587e-07, + "loss": 2.5786170959472656, + "refine_loss": 0.0, + "step": 804 + }, + { + "ce_loss": 0.6199502944946289, + "epoch": 0.29756954070788283, + "grad_norm": 11.627620697021484, + "learning_rate": 9.644546900419533e-07, + "loss": 2.4798011779785156, + "refine_loss": 0.0, + "step": 805 + }, + { + "ce_loss": 0.4842696189880371, + "epoch": 0.29793919231124666, + "grad_norm": 11.172536849975586, + "learning_rate": 9.549150281252633e-07, + "loss": 1.9370784759521484, + "refine_loss": 0.0, + "step": 806 + }, + { + "ce_loss": 0.5029268264770508, + "epoch": 0.2983088439146105, + "grad_norm": 11.134127616882324, + "learning_rate": 9.454177972358258e-07, + "loss": 2.011707305908203, + "refine_loss": 0.0, + "step": 807 + }, + { + "ce_loss": 0.5627362728118896, + "epoch": 0.2986784955179743, + "grad_norm": 12.98119831085205, + "learning_rate": 9.359630969951012e-07, + "loss": 2.2509450912475586, + "refine_loss": 0.0, + "step": 808 + }, + { + "ce_loss": 0.597536563873291, + "epoch": 0.29904814712133815, + "grad_norm": 13.014914512634277, + "learning_rate": 9.265510265784189e-07, + "loss": 2.390146255493164, + "refine_loss": 0.0, + "step": 809 + }, + { + "ce_loss": 0.5109608173370361, + "epoch": 0.299417798724702, + "grad_norm": 12.061590194702148, + "learning_rate": 9.171816847139447e-07, + "loss": 2.0438432693481445, + "refine_loss": 0.0, + "step": 810 + }, + { + "ce_loss": 0.5583462715148926, + "epoch": 0.2997874503280658, + "grad_norm": 10.799525260925293, + "learning_rate": 9.078551696816434e-07, + "loss": 2.2333850860595703, + "refine_loss": 0.0, + "step": 811 + }, + { + "ce_loss": 0.6024312973022461, + "epoch": 0.30015710193142964, + "grad_norm": 10.332197189331055, + "learning_rate": 8.985715793122407e-07, + "loss": 2.4097251892089844, + "refine_loss": 0.0, + "step": 812 + }, + { + "ce_loss": 0.6148918867111206, + "epoch": 0.30052675353479347, + "grad_norm": 12.872210502624512, + "learning_rate": 8.893310109862102e-07, + "loss": 2.4595675468444824, + "refine_loss": 0.0, + "step": 813 + }, + { + "ce_loss": 0.48483943939208984, + "epoch": 0.3008964051381573, + "grad_norm": 10.203714370727539, + "learning_rate": 8.801335616327378e-07, + "loss": 1.9393577575683594, + "refine_loss": 0.0, + "step": 814 + }, + { + "ce_loss": 0.6568851470947266, + "epoch": 0.3012660567415211, + "grad_norm": 13.283143997192383, + "learning_rate": 8.709793277287182e-07, + "loss": 2.6275405883789062, + "refine_loss": 0.0, + "step": 815 + }, + { + "ce_loss": 0.5225791931152344, + "epoch": 0.30163570834488496, + "grad_norm": 14.360607147216797, + "learning_rate": 8.618684052977305e-07, + "loss": 2.0903167724609375, + "refine_loss": 0.0, + "step": 816 + }, + { + "ce_loss": 0.6814401149749756, + "epoch": 0.3020053599482488, + "grad_norm": 14.771556854248047, + "learning_rate": 8.528008899090412e-07, + "loss": 2.7257604598999023, + "refine_loss": 0.0, + "step": 817 + }, + { + "ce_loss": 0.5417075157165527, + "epoch": 0.3023750115516126, + "grad_norm": 11.415335655212402, + "learning_rate": 8.437768766765975e-07, + "loss": 2.166830062866211, + "refine_loss": 0.0, + "step": 818 + }, + { + "ce_loss": 0.5189037322998047, + "epoch": 0.30274466315497645, + "grad_norm": 10.6888427734375, + "learning_rate": 8.347964602580245e-07, + "loss": 2.0756149291992188, + "refine_loss": 0.0, + "step": 819 + }, + { + "ce_loss": 0.5266027450561523, + "epoch": 0.3031143147583403, + "grad_norm": 11.35065746307373, + "learning_rate": 8.258597348536452e-07, + "loss": 2.1064109802246094, + "refine_loss": 0.0, + "step": 820 + }, + { + "ce_loss": 0.627051591873169, + "epoch": 0.3034839663617041, + "grad_norm": 10.9017333984375, + "learning_rate": 8.16966794205476e-07, + "loss": 2.508206367492676, + "refine_loss": 0.0, + "step": 821 + }, + { + "ce_loss": 0.4798703193664551, + "epoch": 0.30385361796506793, + "grad_norm": 13.04900074005127, + "learning_rate": 8.081177315962601e-07, + "loss": 1.9194812774658203, + "refine_loss": 0.0, + "step": 822 + }, + { + "ce_loss": 0.5484380722045898, + "epoch": 0.30422326956843176, + "grad_norm": 11.526810646057129, + "learning_rate": 7.993126398484741e-07, + "loss": 2.1937522888183594, + "refine_loss": 0.0, + "step": 823 + }, + { + "ce_loss": 0.4842212200164795, + "epoch": 0.3045929211717956, + "grad_norm": 13.79810905456543, + "learning_rate": 7.905516113233652e-07, + "loss": 1.936884880065918, + "refine_loss": 0.0, + "step": 824 + }, + { + "ce_loss": 0.4612412452697754, + "epoch": 0.3049625727751594, + "grad_norm": 16.749582290649414, + "learning_rate": 7.818347379199781e-07, + "loss": 1.8449649810791016, + "refine_loss": 0.0, + "step": 825 + }, + { + "ce_loss": 0.4196434020996094, + "epoch": 0.30533222437852325, + "grad_norm": 9.30785846710205, + "learning_rate": 7.731621110741871e-07, + "loss": 1.6785736083984375, + "refine_loss": 0.0, + "step": 826 + }, + { + "ce_loss": 0.5549745559692383, + "epoch": 0.3057018759818871, + "grad_norm": 14.977422714233398, + "learning_rate": 7.645338217577474e-07, + "loss": 2.219898223876953, + "refine_loss": 0.0, + "step": 827 + }, + { + "ce_loss": 0.47655820846557617, + "epoch": 0.3060715275852509, + "grad_norm": 11.516161918640137, + "learning_rate": 7.55949960477328e-07, + "loss": 1.9062328338623047, + "refine_loss": 0.0, + "step": 828 + }, + { + "ce_loss": 0.5957193374633789, + "epoch": 0.30644117918861474, + "grad_norm": 11.31722354888916, + "learning_rate": 7.474106172735746e-07, + "loss": 2.3828773498535156, + "refine_loss": 0.0, + "step": 829 + }, + { + "ce_loss": 0.4734663963317871, + "epoch": 0.3068108307919786, + "grad_norm": 10.413247108459473, + "learning_rate": 7.389158817201541e-07, + "loss": 1.8938655853271484, + "refine_loss": 0.0, + "step": 830 + }, + { + "ce_loss": 0.5822370052337646, + "epoch": 0.3071804823953424, + "grad_norm": 11.867500305175781, + "learning_rate": 7.304658429228245e-07, + "loss": 2.3289480209350586, + "refine_loss": 0.0, + "step": 831 + }, + { + "ce_loss": 0.6943693161010742, + "epoch": 0.30755013399870623, + "grad_norm": 17.84710121154785, + "learning_rate": 7.220605895184946e-07, + "loss": 2.777477264404297, + "refine_loss": 0.0, + "step": 832 + }, + { + "ce_loss": 0.5503215789794922, + "epoch": 0.30791978560207006, + "grad_norm": 11.138740539550781, + "learning_rate": 7.13700209674294e-07, + "loss": 2.2012863159179688, + "refine_loss": 0.0, + "step": 833 + }, + { + "ce_loss": 0.6137699484825134, + "epoch": 0.3082894372054339, + "grad_norm": 14.548842430114746, + "learning_rate": 7.053847910866513e-07, + "loss": 2.4550797939300537, + "refine_loss": 0.0, + "step": 834 + }, + { + "ce_loss": 0.493664026260376, + "epoch": 0.3086590888087977, + "grad_norm": 15.09384822845459, + "learning_rate": 6.971144209803738e-07, + "loss": 1.974656105041504, + "refine_loss": 0.0, + "step": 835 + }, + { + "ce_loss": 0.48051023483276367, + "epoch": 0.30902874041216155, + "grad_norm": 11.184260368347168, + "learning_rate": 6.888891861077301e-07, + "loss": 1.9220409393310547, + "refine_loss": 0.0, + "step": 836 + }, + { + "ce_loss": 0.5180845260620117, + "epoch": 0.3093983920155254, + "grad_norm": 14.186049461364746, + "learning_rate": 6.807091727475412e-07, + "loss": 2.072338104248047, + "refine_loss": 0.0, + "step": 837 + }, + { + "ce_loss": 0.5101375579833984, + "epoch": 0.3097680436188892, + "grad_norm": 11.600339889526367, + "learning_rate": 6.725744667042778e-07, + "loss": 2.0405502319335938, + "refine_loss": 0.0, + "step": 838 + }, + { + "ce_loss": 0.5894982814788818, + "epoch": 0.31013769522225304, + "grad_norm": 11.818429946899414, + "learning_rate": 6.644851533071556e-07, + "loss": 2.3579931259155273, + "refine_loss": 0.0, + "step": 839 + }, + { + "ce_loss": 0.52535480260849, + "epoch": 0.31050734682561687, + "grad_norm": 14.094864845275879, + "learning_rate": 6.564413174092443e-07, + "loss": 2.10141921043396, + "refine_loss": 0.0, + "step": 840 + }, + { + "ce_loss": 0.5426522493362427, + "epoch": 0.3108769984289807, + "grad_norm": 12.866199493408203, + "learning_rate": 6.484430433865785e-07, + "loss": 2.1706089973449707, + "refine_loss": 0.0, + "step": 841 + }, + { + "ce_loss": 0.6054232120513916, + "epoch": 0.31124665003234453, + "grad_norm": 10.934690475463867, + "learning_rate": 6.404904151372649e-07, + "loss": 2.4216928482055664, + "refine_loss": 0.0, + "step": 842 + }, + { + "ce_loss": 0.6082439422607422, + "epoch": 0.31161630163570836, + "grad_norm": 10.365323066711426, + "learning_rate": 6.325835160806132e-07, + "loss": 2.4329757690429688, + "refine_loss": 0.0, + "step": 843 + }, + { + "ce_loss": 0.3877139091491699, + "epoch": 0.3119859532390722, + "grad_norm": 10.361323356628418, + "learning_rate": 6.24722429156251e-07, + "loss": 1.5508556365966797, + "refine_loss": 0.0, + "step": 844 + }, + { + "ce_loss": 0.5194351077079773, + "epoch": 0.312355604842436, + "grad_norm": 12.066929817199707, + "learning_rate": 6.16907236823262e-07, + "loss": 2.077740430831909, + "refine_loss": 0.0, + "step": 845 + }, + { + "ce_loss": 0.6072402000427246, + "epoch": 0.31272525644579985, + "grad_norm": 11.460638046264648, + "learning_rate": 6.091380210593145e-07, + "loss": 2.4289608001708984, + "refine_loss": 0.0, + "step": 846 + }, + { + "ce_loss": 0.5939306020736694, + "epoch": 0.3130949080491637, + "grad_norm": 16.3962459564209, + "learning_rate": 6.014148633598055e-07, + "loss": 2.3757224082946777, + "refine_loss": 0.0, + "step": 847 + }, + { + "ce_loss": 0.5240622758865356, + "epoch": 0.3134645596525275, + "grad_norm": 12.676226615905762, + "learning_rate": 5.937378447370068e-07, + "loss": 2.0962491035461426, + "refine_loss": 0.0, + "step": 848 + }, + { + "ce_loss": 0.5542137622833252, + "epoch": 0.31383421125589134, + "grad_norm": 12.516949653625488, + "learning_rate": 5.861070457192081e-07, + "loss": 2.216855049133301, + "refine_loss": 0.0, + "step": 849 + }, + { + "ce_loss": 0.5623779296875, + "epoch": 0.31420386285925517, + "grad_norm": 10.861198425292969, + "learning_rate": 5.785225463498828e-07, + "loss": 2.24951171875, + "refine_loss": 0.0, + "step": 850 + }, + { + "ce_loss": 0.4837503433227539, + "epoch": 0.314573514462619, + "grad_norm": 9.769064903259277, + "learning_rate": 5.709844261868381e-07, + "loss": 1.9350013732910156, + "refine_loss": 0.0, + "step": 851 + }, + { + "ce_loss": 0.5295228958129883, + "epoch": 0.3149431660659828, + "grad_norm": 10.997794151306152, + "learning_rate": 5.634927643013899e-07, + "loss": 2.118091583251953, + "refine_loss": 0.0, + "step": 852 + }, + { + "ce_loss": 0.5784568786621094, + "epoch": 0.31531281766934666, + "grad_norm": 15.276631355285645, + "learning_rate": 5.560476392775239e-07, + "loss": 2.3138275146484375, + "refine_loss": 0.0, + "step": 853 + }, + { + "ce_loss": 0.40522223711013794, + "epoch": 0.3156824692727105, + "grad_norm": 11.019256591796875, + "learning_rate": 5.486491292110796e-07, + "loss": 1.6208889484405518, + "refine_loss": 0.0, + "step": 854 + }, + { + "ce_loss": 0.5638465881347656, + "epoch": 0.3160521208760743, + "grad_norm": 10.970101356506348, + "learning_rate": 5.412973117089288e-07, + "loss": 2.2553863525390625, + "refine_loss": 0.0, + "step": 855 + }, + { + "ce_loss": 0.5387318134307861, + "epoch": 0.31642177247943815, + "grad_norm": 13.4029541015625, + "learning_rate": 5.339922638881545e-07, + "loss": 2.1549272537231445, + "refine_loss": 0.0, + "step": 856 + }, + { + "ce_loss": 0.5652360916137695, + "epoch": 0.316791424082802, + "grad_norm": 13.54471492767334, + "learning_rate": 5.267340623752554e-07, + "loss": 2.260944366455078, + "refine_loss": 0.0, + "step": 857 + }, + { + "ce_loss": 0.5265438556671143, + "epoch": 0.3171610756861658, + "grad_norm": 12.33569049835205, + "learning_rate": 5.195227833053273e-07, + "loss": 2.106175422668457, + "refine_loss": 0.0, + "step": 858 + }, + { + "ce_loss": 0.6165313720703125, + "epoch": 0.31753072728952964, + "grad_norm": 13.093805313110352, + "learning_rate": 5.123585023212785e-07, + "loss": 2.46612548828125, + "refine_loss": 0.0, + "step": 859 + }, + { + "ce_loss": 0.5988349914550781, + "epoch": 0.31790037889289346, + "grad_norm": 13.105881690979004, + "learning_rate": 5.05241294573024e-07, + "loss": 2.3953399658203125, + "refine_loss": 0.0, + "step": 860 + }, + { + "ce_loss": 0.5752487182617188, + "epoch": 0.3182700304962573, + "grad_norm": 13.954177856445312, + "learning_rate": 4.981712347167061e-07, + "loss": 2.300994873046875, + "refine_loss": 0.0, + "step": 861 + }, + { + "ce_loss": 0.46507906913757324, + "epoch": 0.3186396820996211, + "grad_norm": 9.474684715270996, + "learning_rate": 4.911483969139086e-07, + "loss": 1.860316276550293, + "refine_loss": 0.0, + "step": 862 + }, + { + "ce_loss": 0.6140155792236328, + "epoch": 0.31900933370298495, + "grad_norm": 11.393430709838867, + "learning_rate": 4.841728548308744e-07, + "loss": 2.4560623168945312, + "refine_loss": 0.0, + "step": 863 + }, + { + "ce_loss": 0.4910316467285156, + "epoch": 0.3193789853063488, + "grad_norm": 10.711258888244629, + "learning_rate": 4.772446816377408e-07, + "loss": 1.9641265869140625, + "refine_loss": 0.0, + "step": 864 + }, + { + "ce_loss": 0.638153076171875, + "epoch": 0.3197486369097126, + "grad_norm": 19.952075958251953, + "learning_rate": 4.7036395000776556e-07, + "loss": 2.5526123046875, + "refine_loss": 0.0, + "step": 865 + }, + { + "ce_loss": 0.5287799835205078, + "epoch": 0.32011828851307644, + "grad_norm": 12.692007064819336, + "learning_rate": 4.6353073211656886e-07, + "loss": 2.1151199340820312, + "refine_loss": 0.0, + "step": 866 + }, + { + "ce_loss": 0.6020537614822388, + "epoch": 0.3204879401164403, + "grad_norm": 13.311338424682617, + "learning_rate": 4.5674509964137136e-07, + "loss": 2.408215045928955, + "refine_loss": 0.0, + "step": 867 + }, + { + "ce_loss": 0.6297923922538757, + "epoch": 0.3208575917198041, + "grad_norm": 13.09640121459961, + "learning_rate": 4.5000712376024826e-07, + "loss": 2.519169569015503, + "refine_loss": 0.0, + "step": 868 + }, + { + "ce_loss": 0.515760064125061, + "epoch": 0.32122724332316793, + "grad_norm": 11.322277069091797, + "learning_rate": 4.4331687515137614e-07, + "loss": 2.063040256500244, + "refine_loss": 0.0, + "step": 869 + }, + { + "ce_loss": 0.48736572265625, + "epoch": 0.32159689492653176, + "grad_norm": 10.963517189025879, + "learning_rate": 4.3667442399229985e-07, + "loss": 1.949462890625, + "refine_loss": 0.0, + "step": 870 + }, + { + "ce_loss": 0.6139624118804932, + "epoch": 0.3219665465298956, + "grad_norm": 11.560728073120117, + "learning_rate": 4.30079839959186e-07, + "loss": 2.4558496475219727, + "refine_loss": 0.0, + "step": 871 + }, + { + "ce_loss": 0.5202231407165527, + "epoch": 0.3223361981332594, + "grad_norm": 13.655935287475586, + "learning_rate": 4.2353319222610265e-07, + "loss": 2.080892562866211, + "refine_loss": 0.0, + "step": 872 + }, + { + "ce_loss": 0.6372876167297363, + "epoch": 0.32270584973662325, + "grad_norm": 12.013011932373047, + "learning_rate": 4.1703454946428635e-07, + "loss": 2.5491504669189453, + "refine_loss": 0.0, + "step": 873 + }, + { + "ce_loss": 0.7090787887573242, + "epoch": 0.3230755013399871, + "grad_norm": 15.784218788146973, + "learning_rate": 4.1058397984142405e-07, + "loss": 2.836315155029297, + "refine_loss": 0.0, + "step": 874 + }, + { + "ce_loss": 0.6786842346191406, + "epoch": 0.3234451529433509, + "grad_norm": 15.491901397705078, + "learning_rate": 4.041815510209396e-07, + "loss": 2.7147369384765625, + "refine_loss": 0.0, + "step": 875 + }, + { + "ce_loss": 0.547095775604248, + "epoch": 0.32381480454671474, + "grad_norm": 12.721159934997559, + "learning_rate": 3.9782733016128006e-07, + "loss": 2.188383102416992, + "refine_loss": 0.0, + "step": 876 + }, + { + "ce_loss": 0.5848770141601562, + "epoch": 0.32418445615007857, + "grad_norm": 11.792797088623047, + "learning_rate": 3.9152138391521766e-07, + "loss": 2.339508056640625, + "refine_loss": 0.0, + "step": 877 + }, + { + "ce_loss": 0.5794501304626465, + "epoch": 0.3245541077534424, + "grad_norm": 11.984404563903809, + "learning_rate": 3.852637784291424e-07, + "loss": 2.317800521850586, + "refine_loss": 0.0, + "step": 878 + }, + { + "ce_loss": 0.54901123046875, + "epoch": 0.32492375935680623, + "grad_norm": 13.713623046875, + "learning_rate": 3.790545793423761e-07, + "loss": 2.196044921875, + "refine_loss": 0.0, + "step": 879 + }, + { + "ce_loss": 0.6358761787414551, + "epoch": 0.32529341096017006, + "grad_norm": 12.058842658996582, + "learning_rate": 3.728938517864794e-07, + "loss": 2.5435047149658203, + "refine_loss": 0.0, + "step": 880 + }, + { + "ce_loss": 0.5443658828735352, + "epoch": 0.3256630625635339, + "grad_norm": 12.601243019104004, + "learning_rate": 3.667816603845681e-07, + "loss": 2.1774635314941406, + "refine_loss": 0.0, + "step": 881 + }, + { + "ce_loss": 0.5234699249267578, + "epoch": 0.3260327141668977, + "grad_norm": 10.138381958007812, + "learning_rate": 3.60718069250639e-07, + "loss": 2.0938796997070312, + "refine_loss": 0.0, + "step": 882 + }, + { + "ce_loss": 0.5154368281364441, + "epoch": 0.32640236577026155, + "grad_norm": 11.418164253234863, + "learning_rate": 3.547031419888919e-07, + "loss": 2.0617473125457764, + "refine_loss": 0.0, + "step": 883 + }, + { + "ce_loss": 0.5620346069335938, + "epoch": 0.3267720173736254, + "grad_norm": 10.558209419250488, + "learning_rate": 3.4873694169306915e-07, + "loss": 2.248138427734375, + "refine_loss": 0.0, + "step": 884 + }, + { + "ce_loss": 0.6364059448242188, + "epoch": 0.3271416689769892, + "grad_norm": 11.820669174194336, + "learning_rate": 3.4281953094578877e-07, + "loss": 2.545623779296875, + "refine_loss": 0.0, + "step": 885 + }, + { + "ce_loss": 0.6794815063476562, + "epoch": 0.32751132058035304, + "grad_norm": 15.534622192382812, + "learning_rate": 3.369509718178887e-07, + "loss": 2.717926025390625, + "refine_loss": 0.0, + "step": 886 + }, + { + "ce_loss": 0.5086033344268799, + "epoch": 0.32788097218371687, + "grad_norm": 10.680244445800781, + "learning_rate": 3.3113132586777786e-07, + "loss": 2.0344133377075195, + "refine_loss": 0.0, + "step": 887 + }, + { + "ce_loss": 0.5310554504394531, + "epoch": 0.3282506237870807, + "grad_norm": 12.769584655761719, + "learning_rate": 3.2536065414078724e-07, + "loss": 2.1242218017578125, + "refine_loss": 0.0, + "step": 888 + }, + { + "ce_loss": 0.5896167755126953, + "epoch": 0.3286202753904445, + "grad_norm": 14.286514282226562, + "learning_rate": 3.196390171685343e-07, + "loss": 2.3584671020507812, + "refine_loss": 0.0, + "step": 889 + }, + { + "ce_loss": 0.5105276107788086, + "epoch": 0.32898992699380836, + "grad_norm": 13.012909889221191, + "learning_rate": 3.1396647496828245e-07, + "loss": 2.0421104431152344, + "refine_loss": 0.0, + "step": 890 + }, + { + "ce_loss": 0.6964492797851562, + "epoch": 0.3293595785971722, + "grad_norm": 12.75920295715332, + "learning_rate": 3.0834308704231485e-07, + "loss": 2.785797119140625, + "refine_loss": 0.0, + "step": 891 + }, + { + "ce_loss": 0.42442476749420166, + "epoch": 0.329729230200536, + "grad_norm": 10.83887004852295, + "learning_rate": 3.0276891237731085e-07, + "loss": 1.6976990699768066, + "refine_loss": 0.0, + "step": 892 + }, + { + "ce_loss": 0.663848876953125, + "epoch": 0.33009888180389985, + "grad_norm": 11.957953453063965, + "learning_rate": 2.97244009443724e-07, + "loss": 2.6553955078125, + "refine_loss": 0.0, + "step": 893 + }, + { + "ce_loss": 0.5286145210266113, + "epoch": 0.3304685334072637, + "grad_norm": 19.558382034301758, + "learning_rate": 2.917684361951728e-07, + "loss": 2.1144580841064453, + "refine_loss": 0.0, + "step": 894 + }, + { + "ce_loss": 0.605743408203125, + "epoch": 0.3308381850106275, + "grad_norm": 11.474892616271973, + "learning_rate": 2.8634225006782867e-07, + "loss": 2.4229736328125, + "refine_loss": 0.0, + "step": 895 + }, + { + "ce_loss": 0.47480297088623047, + "epoch": 0.33120783661399134, + "grad_norm": 12.118206977844238, + "learning_rate": 2.809655079798179e-07, + "loss": 1.8992118835449219, + "refine_loss": 0.0, + "step": 896 + }, + { + "ce_loss": 0.6329402923583984, + "epoch": 0.33157748821735517, + "grad_norm": 13.21630573272705, + "learning_rate": 2.75638266330619e-07, + "loss": 2.5317611694335938, + "refine_loss": 0.0, + "step": 897 + }, + { + "ce_loss": 0.52659010887146, + "epoch": 0.331947139820719, + "grad_norm": 15.188313484191895, + "learning_rate": 2.7036058100047723e-07, + "loss": 2.10636043548584, + "refine_loss": 0.0, + "step": 898 + }, + { + "ce_loss": 0.6456542015075684, + "epoch": 0.3323167914240828, + "grad_norm": 19.598548889160156, + "learning_rate": 2.65132507349814e-07, + "loss": 2.5826168060302734, + "refine_loss": 0.0, + "step": 899 + }, + { + "ce_loss": 0.5238099098205566, + "epoch": 0.33268644302744665, + "grad_norm": 10.917774200439453, + "learning_rate": 2.599541002186479e-07, + "loss": 2.0952396392822266, + "refine_loss": 0.0, + "step": 900 + }, + { + "ce_loss": 0.5356693267822266, + "epoch": 0.3330560946308105, + "grad_norm": 15.470126152038574, + "learning_rate": 2.5482541392601924e-07, + "loss": 2.1426773071289062, + "refine_loss": 0.0, + "step": 901 + }, + { + "ce_loss": 0.6122074127197266, + "epoch": 0.3334257462341743, + "grad_norm": 11.08836841583252, + "learning_rate": 2.497465022694207e-07, + "loss": 2.4488296508789062, + "refine_loss": 0.0, + "step": 902 + }, + { + "ce_loss": 0.58880615234375, + "epoch": 0.33379539783753814, + "grad_norm": 24.284381866455078, + "learning_rate": 2.447174185242324e-07, + "loss": 2.355224609375, + "refine_loss": 0.0, + "step": 903 + }, + { + "ce_loss": 0.6379722356796265, + "epoch": 0.334165049440902, + "grad_norm": 10.364398956298828, + "learning_rate": 2.397382154431621e-07, + "loss": 2.551888942718506, + "refine_loss": 0.0, + "step": 904 + }, + { + "ce_loss": 0.5614376068115234, + "epoch": 0.3345347010442658, + "grad_norm": 12.168800354003906, + "learning_rate": 2.3480894525569564e-07, + "loss": 2.2457504272460938, + "refine_loss": 0.0, + "step": 905 + }, + { + "ce_loss": 0.6385574340820312, + "epoch": 0.33490435264762963, + "grad_norm": 12.758026123046875, + "learning_rate": 2.2992965966754378e-07, + "loss": 2.554229736328125, + "refine_loss": 0.0, + "step": 906 + }, + { + "ce_loss": 0.43265199661254883, + "epoch": 0.33527400425099346, + "grad_norm": 11.041328430175781, + "learning_rate": 2.251004098601034e-07, + "loss": 1.7306079864501953, + "refine_loss": 0.0, + "step": 907 + }, + { + "ce_loss": 0.6389098167419434, + "epoch": 0.3356436558543573, + "grad_norm": 14.209795951843262, + "learning_rate": 2.2032124648992015e-07, + "loss": 2.5556392669677734, + "refine_loss": 0.0, + "step": 908 + }, + { + "ce_loss": 0.6411924362182617, + "epoch": 0.3360133074577211, + "grad_norm": 13.244110107421875, + "learning_rate": 2.1559221968815547e-07, + "loss": 2.564769744873047, + "refine_loss": 0.0, + "step": 909 + }, + { + "ce_loss": 0.5545706748962402, + "epoch": 0.33638295906108495, + "grad_norm": 10.952569007873535, + "learning_rate": 2.109133790600648e-07, + "loss": 2.218282699584961, + "refine_loss": 0.0, + "step": 910 + }, + { + "ce_loss": 0.6292123794555664, + "epoch": 0.3367526106644488, + "grad_norm": 13.721379280090332, + "learning_rate": 2.062847736844703e-07, + "loss": 2.5168495178222656, + "refine_loss": 0.0, + "step": 911 + }, + { + "ce_loss": 0.5003075003623962, + "epoch": 0.3371222622678126, + "grad_norm": 10.782014846801758, + "learning_rate": 2.0170645211325335e-07, + "loss": 2.001230001449585, + "refine_loss": 0.0, + "step": 912 + }, + { + "ce_loss": 0.56982421875, + "epoch": 0.33749191387117644, + "grad_norm": 13.543312072753906, + "learning_rate": 1.9717846237084005e-07, + "loss": 2.279296875, + "refine_loss": 0.0, + "step": 913 + }, + { + "ce_loss": 0.5316064357757568, + "epoch": 0.33786156547454027, + "grad_norm": 11.71592903137207, + "learning_rate": 1.9270085195370048e-07, + "loss": 2.1264257431030273, + "refine_loss": 0.0, + "step": 914 + }, + { + "ce_loss": 0.5829057693481445, + "epoch": 0.3382312170779041, + "grad_norm": 14.641136169433594, + "learning_rate": 1.8827366782984913e-07, + "loss": 2.331623077392578, + "refine_loss": 0.0, + "step": 915 + }, + { + "ce_loss": 0.658900260925293, + "epoch": 0.33860086868126793, + "grad_norm": 18.183748245239258, + "learning_rate": 1.838969564383525e-07, + "loss": 2.635601043701172, + "refine_loss": 0.0, + "step": 916 + }, + { + "ce_loss": 0.6091384887695312, + "epoch": 0.33897052028463176, + "grad_norm": 14.12417984008789, + "learning_rate": 1.7957076368884274e-07, + "loss": 2.436553955078125, + "refine_loss": 0.0, + "step": 917 + }, + { + "ce_loss": 0.4583473205566406, + "epoch": 0.3393401718879956, + "grad_norm": 12.558837890625, + "learning_rate": 1.7529513496103322e-07, + "loss": 1.8333892822265625, + "refine_loss": 0.0, + "step": 918 + }, + { + "ce_loss": 0.6492862701416016, + "epoch": 0.3397098234913594, + "grad_norm": 12.802929878234863, + "learning_rate": 1.7107011510424766e-07, + "loss": 2.5971450805664062, + "refine_loss": 0.0, + "step": 919 + }, + { + "ce_loss": 0.5062646865844727, + "epoch": 0.34007947509472325, + "grad_norm": 14.494813919067383, + "learning_rate": 1.6689574843694433e-07, + "loss": 2.0250587463378906, + "refine_loss": 0.0, + "step": 920 + }, + { + "ce_loss": 0.8153018951416016, + "epoch": 0.3404491266980871, + "grad_norm": 14.68864917755127, + "learning_rate": 1.6277207874625444e-07, + "loss": 3.2612075805664062, + "refine_loss": 0.0, + "step": 921 + }, + { + "ce_loss": 0.5692024230957031, + "epoch": 0.3408187783014509, + "grad_norm": 14.737730026245117, + "learning_rate": 1.5869914928752117e-07, + "loss": 2.2768096923828125, + "refine_loss": 0.0, + "step": 922 + }, + { + "ce_loss": 0.6219048500061035, + "epoch": 0.34118842990481474, + "grad_norm": 13.755121231079102, + "learning_rate": 1.546770027838479e-07, + "loss": 2.487619400024414, + "refine_loss": 0.0, + "step": 923 + }, + { + "ce_loss": 0.6337080001831055, + "epoch": 0.34155808150817857, + "grad_norm": 15.2078275680542, + "learning_rate": 1.5070568142564912e-07, + "loss": 2.534832000732422, + "refine_loss": 0.0, + "step": 924 + }, + { + "ce_loss": 0.6485633850097656, + "epoch": 0.3419277331115424, + "grad_norm": 19.067825317382812, + "learning_rate": 1.4678522687020414e-07, + "loss": 2.5942535400390625, + "refine_loss": 0.0, + "step": 925 + }, + { + "ce_loss": 0.603485107421875, + "epoch": 0.3422973847149062, + "grad_norm": 10.100275993347168, + "learning_rate": 1.4291568024122848e-07, + "loss": 2.4139404296875, + "refine_loss": 0.0, + "step": 926 + }, + { + "ce_loss": 0.6357574462890625, + "epoch": 0.34266703631827006, + "grad_norm": 14.526355743408203, + "learning_rate": 1.390970821284343e-07, + "loss": 2.54302978515625, + "refine_loss": 0.0, + "step": 927 + }, + { + "ce_loss": 0.5712575912475586, + "epoch": 0.3430366879216339, + "grad_norm": 12.46221923828125, + "learning_rate": 1.3532947258710905e-07, + "loss": 2.2850303649902344, + "refine_loss": 0.0, + "step": 928 + }, + { + "ce_loss": 0.5754714012145996, + "epoch": 0.3434063395249977, + "grad_norm": 14.730500221252441, + "learning_rate": 1.3161289113769405e-07, + "loss": 2.3018856048583984, + "refine_loss": 0.0, + "step": 929 + }, + { + "ce_loss": 0.7294296026229858, + "epoch": 0.34377599112836155, + "grad_norm": 13.687814712524414, + "learning_rate": 1.2794737676536993e-07, + "loss": 2.9177184104919434, + "refine_loss": 0.0, + "step": 930 + }, + { + "ce_loss": 0.645714282989502, + "epoch": 0.3441456427317254, + "grad_norm": 16.900068283081055, + "learning_rate": 1.2433296791964754e-07, + "loss": 2.582857131958008, + "refine_loss": 0.0, + "step": 931 + }, + { + "ce_loss": 0.5273637771606445, + "epoch": 0.3445152943350892, + "grad_norm": 15.606173515319824, + "learning_rate": 1.2076970251396593e-07, + "loss": 2.109455108642578, + "refine_loss": 0.0, + "step": 932 + }, + { + "ce_loss": 0.7490692138671875, + "epoch": 0.344884945938453, + "grad_norm": 16.67793846130371, + "learning_rate": 1.1725761792529378e-07, + "loss": 2.99627685546875, + "refine_loss": 0.0, + "step": 933 + }, + { + "ce_loss": 0.5859098434448242, + "epoch": 0.3452545975418168, + "grad_norm": 16.49915885925293, + "learning_rate": 1.1379675099373489e-07, + "loss": 2.343639373779297, + "refine_loss": 0.0, + "step": 934 + }, + { + "ce_loss": 0.6564898490905762, + "epoch": 0.34562424914518064, + "grad_norm": 13.992547035217285, + "learning_rate": 1.1038713802214718e-07, + "loss": 2.6259593963623047, + "refine_loss": 0.0, + "step": 935 + }, + { + "ce_loss": 0.5625448226928711, + "epoch": 0.34599390074854447, + "grad_norm": 13.52691650390625, + "learning_rate": 1.0702881477575589e-07, + "loss": 2.2501792907714844, + "refine_loss": 0.0, + "step": 936 + }, + { + "ce_loss": 0.6467852592468262, + "epoch": 0.3463635523519083, + "grad_norm": 16.410654067993164, + "learning_rate": 1.0372181648178436e-07, + "loss": 2.5871410369873047, + "refine_loss": 0.0, + "step": 937 + }, + { + "ce_loss": 0.5897586345672607, + "epoch": 0.34673320395527213, + "grad_norm": 13.22428035736084, + "learning_rate": 1.004661778290783e-07, + "loss": 2.359034538269043, + "refine_loss": 0.0, + "step": 938 + }, + { + "ce_loss": 0.6049609184265137, + "epoch": 0.34710285555863596, + "grad_norm": 11.715444564819336, + "learning_rate": 9.726193296774767e-08, + "loss": 2.4198436737060547, + "refine_loss": 0.0, + "step": 939 + }, + { + "ce_loss": 0.6639366149902344, + "epoch": 0.3474725071619998, + "grad_norm": 18.374814987182617, + "learning_rate": 9.410911550880474e-08, + "loss": 2.6557464599609375, + "refine_loss": 0.0, + "step": 940 + }, + { + "ce_loss": 0.6365656852722168, + "epoch": 0.3478421587653636, + "grad_norm": 17.330833435058594, + "learning_rate": 9.100775852381227e-08, + "loss": 2.546262741088867, + "refine_loss": 0.0, + "step": 941 + }, + { + "ce_loss": 0.4746434688568115, + "epoch": 0.34821181036872745, + "grad_norm": 12.578265190124512, + "learning_rate": 8.795789454453862e-08, + "loss": 1.898573875427246, + "refine_loss": 0.0, + "step": 942 + }, + { + "ce_loss": 0.8602142333984375, + "epoch": 0.3485814619720913, + "grad_norm": 16.09406852722168, + "learning_rate": 8.495955556261204e-08, + "loss": 3.44085693359375, + "refine_loss": 0.0, + "step": 943 + }, + { + "ce_loss": 0.5457043647766113, + "epoch": 0.3489511135754551, + "grad_norm": 13.791275978088379, + "learning_rate": 8.201277302919086e-08, + "loss": 2.1828174591064453, + "refine_loss": 0.0, + "step": 944 + }, + { + "ce_loss": 0.6922988891601562, + "epoch": 0.34932076517881894, + "grad_norm": 12.478495597839355, + "learning_rate": 7.911757785462882e-08, + "loss": 2.769195556640625, + "refine_loss": 0.0, + "step": 945 + }, + { + "ce_loss": 0.6584944725036621, + "epoch": 0.34969041678218277, + "grad_norm": 23.750160217285156, + "learning_rate": 7.627400040815414e-08, + "loss": 2.6339778900146484, + "refine_loss": 0.0, + "step": 946 + }, + { + "ce_loss": 0.7592735290527344, + "epoch": 0.3500600683855466, + "grad_norm": 16.67685317993164, + "learning_rate": 7.34820705175482e-08, + "loss": 3.0370941162109375, + "refine_loss": 0.0, + "step": 947 + }, + { + "ce_loss": 0.5827199220657349, + "epoch": 0.3504297199889104, + "grad_norm": 13.422663688659668, + "learning_rate": 7.074181746883402e-08, + "loss": 2.3308796882629395, + "refine_loss": 0.0, + "step": 948 + }, + { + "ce_loss": 0.7781219482421875, + "epoch": 0.35079937159227426, + "grad_norm": 16.153005599975586, + "learning_rate": 6.805327000596995e-08, + "loss": 3.11248779296875, + "refine_loss": 0.0, + "step": 949 + }, + { + "ce_loss": 0.708221435546875, + "epoch": 0.3511690231956381, + "grad_norm": 11.349632263183594, + "learning_rate": 6.54164563305465e-08, + "loss": 2.8328857421875, + "refine_loss": 0.0, + "step": 950 + }, + { + "ce_loss": 0.7175531387329102, + "epoch": 0.3515386747990019, + "grad_norm": 16.829397201538086, + "learning_rate": 6.283140410149213e-08, + "loss": 2.8702125549316406, + "refine_loss": 0.0, + "step": 951 + }, + { + "ce_loss": 0.6436319351196289, + "epoch": 0.35190832640236575, + "grad_norm": 15.751110076904297, + "learning_rate": 6.029814043478022e-08, + "loss": 2.5745277404785156, + "refine_loss": 0.0, + "step": 952 + }, + { + "ce_loss": 0.6885108947753906, + "epoch": 0.3522779780057296, + "grad_norm": 16.88723373413086, + "learning_rate": 5.781669190314809e-08, + "loss": 2.7540435791015625, + "refine_loss": 0.0, + "step": 953 + }, + { + "ce_loss": 0.6842339038848877, + "epoch": 0.3526476296090934, + "grad_norm": 15.233070373535156, + "learning_rate": 5.538708453581787e-08, + "loss": 2.736935615539551, + "refine_loss": 0.0, + "step": 954 + }, + { + "ce_loss": 0.5628576278686523, + "epoch": 0.35301728121245723, + "grad_norm": 13.658799171447754, + "learning_rate": 5.3009343818219985e-08, + "loss": 2.2514305114746094, + "refine_loss": 0.0, + "step": 955 + }, + { + "ce_loss": 0.6349101066589355, + "epoch": 0.35338693281582106, + "grad_norm": 17.120283126831055, + "learning_rate": 5.068349469173006e-08, + "loss": 2.539640426635742, + "refine_loss": 0.0, + "step": 956 + }, + { + "ce_loss": 0.6681041717529297, + "epoch": 0.3537565844191849, + "grad_norm": 15.423941612243652, + "learning_rate": 4.840956155340415e-08, + "loss": 2.6724166870117188, + "refine_loss": 0.0, + "step": 957 + }, + { + "ce_loss": 0.5989093780517578, + "epoch": 0.3541262360225487, + "grad_norm": 11.866081237792969, + "learning_rate": 4.618756825572612e-08, + "loss": 2.3956375122070312, + "refine_loss": 0.0, + "step": 958 + }, + { + "ce_loss": 0.5964758396148682, + "epoch": 0.35449588762591255, + "grad_norm": 12.166047096252441, + "learning_rate": 4.40175381063529e-08, + "loss": 2.3859033584594727, + "refine_loss": 0.0, + "step": 959 + }, + { + "ce_loss": 0.6818393468856812, + "epoch": 0.3548655392292764, + "grad_norm": 15.963262557983398, + "learning_rate": 4.189949386787462e-08, + "loss": 2.7273573875427246, + "refine_loss": 0.0, + "step": 960 + }, + { + "ce_loss": 0.744720458984375, + "epoch": 0.3552351908326402, + "grad_norm": 14.606215476989746, + "learning_rate": 3.9833457757572636e-08, + "loss": 2.9788818359375, + "refine_loss": 0.0, + "step": 961 + }, + { + "ce_loss": 0.7027091979980469, + "epoch": 0.35560484243600404, + "grad_norm": 15.369075775146484, + "learning_rate": 3.781945144718912e-08, + "loss": 2.8108367919921875, + "refine_loss": 0.0, + "step": 962 + }, + { + "ce_loss": 0.6390142440795898, + "epoch": 0.3559744940393679, + "grad_norm": 14.233858108520508, + "learning_rate": 3.585749606269562e-08, + "loss": 2.5560569763183594, + "refine_loss": 0.0, + "step": 963 + }, + { + "ce_loss": 0.7089780569076538, + "epoch": 0.3563441456427317, + "grad_norm": 14.224140167236328, + "learning_rate": 3.394761218407705e-08, + "loss": 2.8359122276306152, + "refine_loss": 0.0, + "step": 964 + }, + { + "ce_loss": 0.6093820333480835, + "epoch": 0.35671379724609553, + "grad_norm": 14.470436096191406, + "learning_rate": 3.2089819845111946e-08, + "loss": 2.437528133392334, + "refine_loss": 0.0, + "step": 965 + }, + { + "ce_loss": 0.587188720703125, + "epoch": 0.35708344884945936, + "grad_norm": 12.120734214782715, + "learning_rate": 3.0284138533160924e-08, + "loss": 2.3487548828125, + "refine_loss": 0.0, + "step": 966 + }, + { + "ce_loss": 0.7429046630859375, + "epoch": 0.3574531004528232, + "grad_norm": 15.102005004882812, + "learning_rate": 2.8530587188968508e-08, + "loss": 2.97161865234375, + "refine_loss": 0.0, + "step": 967 + }, + { + "ce_loss": 0.6566063165664673, + "epoch": 0.357822752056187, + "grad_norm": 11.920713424682617, + "learning_rate": 2.6829184206457194e-08, + "loss": 2.626425266265869, + "refine_loss": 0.0, + "step": 968 + }, + { + "ce_loss": 0.7405166625976562, + "epoch": 0.35819240365955085, + "grad_norm": 13.676044464111328, + "learning_rate": 2.5179947432540376e-08, + "loss": 2.962066650390625, + "refine_loss": 0.0, + "step": 969 + }, + { + "ce_loss": 0.663238525390625, + "epoch": 0.3585620552629147, + "grad_norm": 15.06004524230957, + "learning_rate": 2.358289416693027e-08, + "loss": 2.6529541015625, + "refine_loss": 0.0, + "step": 970 + }, + { + "ce_loss": 0.8121566772460938, + "epoch": 0.3589317068662785, + "grad_norm": 12.730252265930176, + "learning_rate": 2.2038041161960288e-08, + "loss": 3.248626708984375, + "refine_loss": 0.0, + "step": 971 + }, + { + "ce_loss": 0.6720213890075684, + "epoch": 0.35930135846964234, + "grad_norm": 14.242924690246582, + "learning_rate": 2.0545404622407396e-08, + "loss": 2.6880855560302734, + "refine_loss": 0.0, + "step": 972 + }, + { + "ce_loss": 0.8273372650146484, + "epoch": 0.35967101007300617, + "grad_norm": 13.401178359985352, + "learning_rate": 1.91050002053228e-08, + "loss": 3.3093490600585938, + "refine_loss": 0.0, + "step": 973 + }, + { + "ce_loss": 0.619896650314331, + "epoch": 0.36004066167637, + "grad_norm": 11.59762191772461, + "learning_rate": 1.7716843019867646e-08, + "loss": 2.479586601257324, + "refine_loss": 0.0, + "step": 974 + }, + { + "ce_loss": 0.6526031494140625, + "epoch": 0.36041031327973383, + "grad_norm": 12.659024238586426, + "learning_rate": 1.6380947627153143e-08, + "loss": 2.61041259765625, + "refine_loss": 0.0, + "step": 975 + }, + { + "ce_loss": 0.6439666748046875, + "epoch": 0.36077996488309766, + "grad_norm": 11.873104095458984, + "learning_rate": 1.509732804009012e-08, + "loss": 2.57586669921875, + "refine_loss": 0.0, + "step": 976 + }, + { + "ce_loss": 0.70196533203125, + "epoch": 0.3611496164864615, + "grad_norm": 14.741543769836426, + "learning_rate": 1.386599772324082e-08, + "loss": 2.807861328125, + "refine_loss": 0.0, + "step": 977 + }, + { + "ce_loss": 0.5659351348876953, + "epoch": 0.3615192680898253, + "grad_norm": 13.794413566589355, + "learning_rate": 1.268696959267679e-08, + "loss": 2.2637405395507812, + "refine_loss": 0.0, + "step": 978 + }, + { + "ce_loss": 0.6732769012451172, + "epoch": 0.36188891969318915, + "grad_norm": 13.74492073059082, + "learning_rate": 1.156025601584676e-08, + "loss": 2.6931076049804688, + "refine_loss": 0.0, + "step": 979 + }, + { + "ce_loss": 0.7427010536193848, + "epoch": 0.362258571296553, + "grad_norm": 11.655576705932617, + "learning_rate": 1.0485868811441757e-08, + "loss": 2.970804214477539, + "refine_loss": 0.0, + "step": 980 + }, + { + "ce_loss": 0.6494433879852295, + "epoch": 0.3626282228999168, + "grad_norm": 15.183904647827148, + "learning_rate": 9.463819249275751e-09, + "loss": 2.597773551940918, + "refine_loss": 0.0, + "step": 981 + }, + { + "ce_loss": 0.6905202865600586, + "epoch": 0.36299787450328064, + "grad_norm": 13.394247055053711, + "learning_rate": 8.494118050164646e-09, + "loss": 2.7620811462402344, + "refine_loss": 0.0, + "step": 982 + }, + { + "ce_loss": 0.8233795166015625, + "epoch": 0.36336752610664447, + "grad_norm": 15.451663970947266, + "learning_rate": 7.576775385815249e-09, + "loss": 3.29351806640625, + "refine_loss": 0.0, + "step": 983 + }, + { + "ce_loss": 0.77020263671875, + "epoch": 0.3637371777100083, + "grad_norm": 18.815898895263672, + "learning_rate": 6.711800878718144e-09, + "loss": 3.080810546875, + "refine_loss": 0.0, + "step": 984 + }, + { + "ce_loss": 0.6873394846916199, + "epoch": 0.3641068293133721, + "grad_norm": 15.296095848083496, + "learning_rate": 5.899203602046655e-09, + "loss": 2.7493579387664795, + "refine_loss": 0.0, + "step": 985 + }, + { + "ce_loss": 0.6941566467285156, + "epoch": 0.36447648091673596, + "grad_norm": 14.844564437866211, + "learning_rate": 5.138992079561367e-09, + "loss": 2.7766265869140625, + "refine_loss": 0.0, + "step": 986 + }, + { + "ce_loss": 0.6506303548812866, + "epoch": 0.3648461325200998, + "grad_norm": 13.75194263458252, + "learning_rate": 4.431174285521866e-09, + "loss": 2.6025214195251465, + "refine_loss": 0.0, + "step": 987 + }, + { + "ce_loss": 0.5989456176757812, + "epoch": 0.3652157841234636, + "grad_norm": 14.863990783691406, + "learning_rate": 3.775757644601808e-09, + "loss": 2.395782470703125, + "refine_loss": 0.0, + "step": 988 + }, + { + "ce_loss": 0.728729248046875, + "epoch": 0.36558543572682745, + "grad_norm": 13.172318458557129, + "learning_rate": 3.1727490318111953e-09, + "loss": 2.9149169921875, + "refine_loss": 0.0, + "step": 989 + }, + { + "ce_loss": 0.6702585220336914, + "epoch": 0.3659550873301913, + "grad_norm": 14.55232048034668, + "learning_rate": 2.6221547724253337e-09, + "loss": 2.6810340881347656, + "refine_loss": 0.0, + "step": 990 + }, + { + "ce_loss": 0.7144622802734375, + "epoch": 0.3663247389335551, + "grad_norm": 13.725797653198242, + "learning_rate": 2.1239806419176556e-09, + "loss": 2.85784912109375, + "refine_loss": 0.0, + "step": 991 + }, + { + "ce_loss": 0.7625013589859009, + "epoch": 0.36669439053691893, + "grad_norm": 19.42701530456543, + "learning_rate": 1.6782318658992159e-09, + "loss": 3.0500054359436035, + "refine_loss": 0.0, + "step": 992 + }, + { + "ce_loss": 0.80938720703125, + "epoch": 0.36706404214028276, + "grad_norm": 14.209688186645508, + "learning_rate": 1.2849131200631804e-09, + "loss": 3.237548828125, + "refine_loss": 0.0, + "step": 993 + }, + { + "ce_loss": 0.7321561574935913, + "epoch": 0.3674336937436466, + "grad_norm": 15.454883575439453, + "learning_rate": 9.440285301370865e-10, + "loss": 2.9286246299743652, + "refine_loss": 0.0, + "step": 994 + }, + { + "ce_loss": 0.8015985488891602, + "epoch": 0.3678033453470104, + "grad_norm": 14.914102554321289, + "learning_rate": 6.555816718389896e-10, + "loss": 3.2063941955566406, + "refine_loss": 0.0, + "step": 995 + }, + { + "ce_loss": 0.8189907073974609, + "epoch": 0.36817299695037425, + "grad_norm": 15.88015365600586, + "learning_rate": 4.1957557084082447e-10, + "loss": 3.2759628295898438, + "refine_loss": 0.0, + "step": 996 + }, + { + "ce_loss": 0.7701222896575928, + "epoch": 0.3685426485537381, + "grad_norm": 15.627527236938477, + "learning_rate": 2.360127027339898e-10, + "loss": 3.080489158630371, + "refine_loss": 0.0, + "step": 997 + }, + { + "ce_loss": 0.624833345413208, + "epoch": 0.3689123001571019, + "grad_norm": 10.658343315124512, + "learning_rate": 1.0489499300603279e-10, + "loss": 2.499333381652832, + "refine_loss": 0.0, + "step": 998 + }, + { + "ce_loss": 0.7457962036132812, + "epoch": 0.36928195176046574, + "grad_norm": 19.188032150268555, + "learning_rate": 2.622381702066523e-11, + "loss": 2.983184814453125, + "refine_loss": 0.0, + "step": 999 + }, + { + "ce_loss": 0.6273384094238281, + "epoch": 0.3696516033638296, + "grad_norm": 13.033102989196777, + "learning_rate": 0.0, + "loss": 2.5093536376953125, + "refine_loss": 0.0, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3429315640355717e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}