Upload 5 files

Browse files

Files changed (5) hide show

config.json +51 -0
generation_config.json +7 -0
model.safetensors +3 -0
tokenizer.json +0 -0
trainer_state.json +2338 -0

config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "architectures": [
+    "Qwen3_5ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "head_dim": 24,
+  "hidden_act": "silu",
+  "hidden_size": 72,
+  "initializer_range": 0.02,
+  "intermediate_size": 288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "linear_conv_kernel_dim": 4,
+  "linear_key_head_dim": 128,
+  "linear_num_key_heads": 16,
+  "linear_num_value_heads": 32,
+  "linear_value_head_dim": 128,
+  "max_position_embeddings": 384,
+  "model_type": "qwen3_5_text",
+  "num_attention_heads": 3,
+  "num_hidden_layers": 12,
+  "num_key_value_heads": 3,
+  "pad_token_id": null,
+  "partial_rotary_factor": 0.25,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "partial_rotary_factor": 0.25,
+    "rope_theta": 10000.0,
+    "rope_type": "default"
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.8.1",
+  "use_cache": false,
+  "vocab_size": 3076
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.8.1",
+  "use_cache": true
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65a889eeea89ea34b0977fb5a56290aefa9d68bb9326fe70de1f584b66095387
+size 5139616

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2338 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7024980118911601,
+  "eval_steps": 3000,
+  "global_step": 89000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.002367970826599416,
+      "grad_norm": 0.5120002627372742,
+      "learning_rate": 2.99e-05,
+      "loss": 7.8463818359375,
+      "step": 300
+    },
+    {
+      "epoch": 0.004735941653198832,
+      "grad_norm": 0.5012888312339783,
+      "learning_rate": 5.989999999999999e-05,
+      "loss": 7.265762532552083,
+      "step": 600
+    },
+    {
+      "epoch": 0.007103912479798249,
+      "grad_norm": 0.4929312467575073,
+      "learning_rate": 8.99e-05,
+      "loss": 6.630942789713542,
+      "step": 900
+    },
+    {
+      "epoch": 0.009471883306397665,
+      "grad_norm": 0.5659914612770081,
+      "learning_rate": 0.00011989999999999999,
+      "loss": 6.048028564453125,
+      "step": 1200
+    },
+    {
+      "epoch": 0.01183985413299708,
+      "grad_norm": 1.3028839826583862,
+      "learning_rate": 0.00014989999999999998,
+      "loss": 5.70859375,
+      "step": 1500
+    },
+    {
+      "epoch": 0.014207824959596499,
+      "grad_norm": 1.2629913091659546,
+      "learning_rate": 0.00017989999999999998,
+      "loss": 5.3664111328125,
+      "step": 1800
+    },
+    {
+      "epoch": 0.016575795786195915,
+      "grad_norm": 1.458747148513794,
+      "learning_rate": 0.00020989999999999998,
+      "loss": 5.094122721354167,
+      "step": 2100
+    },
+    {
+      "epoch": 0.01894376661279533,
+      "grad_norm": 2.0311570167541504,
+      "learning_rate": 0.00023989999999999998,
+      "loss": 4.872973225911458,
+      "step": 2400
+    },
+    {
+      "epoch": 0.021311737439394747,
+      "grad_norm": 1.6081920862197876,
+      "learning_rate": 0.00026989999999999995,
+      "loss": 4.6951521809895835,
+      "step": 2700
+    },
+    {
+      "epoch": 0.02367970826599416,
+      "grad_norm": 1.6408644914627075,
+      "learning_rate": 0.00029989999999999997,
+      "loss": 4.552720947265625,
+      "step": 3000
+    },
+    {
+      "epoch": 0.02367970826599416,
+      "eval_loss": 4.49172830581665,
+      "eval_runtime": 805.9737,
+      "eval_samples_per_second": 113.46,
+      "eval_steps_per_second": 14.183,
+      "step": 3000
+    },
+    {
+      "epoch": 0.02604767909259358,
+      "grad_norm": 1.8765032291412354,
+      "learning_rate": 0.0002992748057659813,
+      "loss": 4.435736083984375,
+      "step": 3300
+    },
+    {
+      "epoch": 0.028415649919192997,
+      "grad_norm": 2.237330675125122,
+      "learning_rate": 0.0002985471861331867,
+      "loss": 4.3409326171875,
+      "step": 3600
+    },
+    {
+      "epoch": 0.03078362074579241,
+      "grad_norm": 1.7407817840576172,
+      "learning_rate": 0.00029781956650039206,
+      "loss": 4.262849934895834,
+      "step": 3900
+    },
+    {
+      "epoch": 0.03315159157239183,
+      "grad_norm": 1.8509694337844849,
+      "learning_rate": 0.00029709194686759744,
+      "loss": 4.203232014973958,
+      "step": 4200
+    },
+    {
+      "epoch": 0.03551956239899125,
+      "grad_norm": 1.4811211824417114,
+      "learning_rate": 0.0002963643272348028,
+      "loss": 4.146802978515625,
+      "step": 4500
+    },
+    {
+      "epoch": 0.03788753322559066,
+      "grad_norm": 1.3642165660858154,
+      "learning_rate": 0.00029563670760200824,
+      "loss": 4.1003564453125,
+      "step": 4800
+    },
+    {
+      "epoch": 0.040255504052190076,
+      "grad_norm": 1.6928187608718872,
+      "learning_rate": 0.00029490908796921356,
+      "loss": 4.057576497395833,
+      "step": 5100
+    },
+    {
+      "epoch": 0.042623474878789494,
+      "grad_norm": 1.5884612798690796,
+      "learning_rate": 0.000294181468336419,
+      "loss": 4.0220039876302085,
+      "step": 5400
+    },
+    {
+      "epoch": 0.04499144570538891,
+      "grad_norm": 1.3557602167129517,
+      "learning_rate": 0.0002934538487036243,
+      "loss": 3.987523193359375,
+      "step": 5700
+    },
+    {
+      "epoch": 0.04735941653198832,
+      "grad_norm": 1.5322710275650024,
+      "learning_rate": 0.0002927262290708297,
+      "loss": 3.9578169759114585,
+      "step": 6000
+    },
+    {
+      "epoch": 0.04735941653198832,
+      "eval_loss": 3.9426522254943848,
+      "eval_runtime": 715.9313,
+      "eval_samples_per_second": 127.73,
+      "eval_steps_per_second": 15.967,
+      "step": 6000
+    },
+    {
+      "epoch": 0.04972738735858774,
+      "grad_norm": 1.4013663530349731,
+      "learning_rate": 0.0002919986094380351,
+      "loss": 3.9291121419270834,
+      "step": 6300
+    },
+    {
+      "epoch": 0.05209535818518716,
+      "grad_norm": 1.4654945135116577,
+      "learning_rate": 0.0002912709898052404,
+      "loss": 3.9034086100260414,
+      "step": 6600
+    },
+    {
+      "epoch": 0.054463329011786576,
+      "grad_norm": 1.298611044883728,
+      "learning_rate": 0.00029054337017244585,
+      "loss": 3.8796598307291665,
+      "step": 6900
+    },
+    {
+      "epoch": 0.056831299838385994,
+      "grad_norm": 1.301093578338623,
+      "learning_rate": 0.00028981575053965117,
+      "loss": 3.8580826822916667,
+      "step": 7200
+    },
+    {
+      "epoch": 0.059199270664985405,
+      "grad_norm": 1.5246330499649048,
+      "learning_rate": 0.0002890881309068566,
+      "loss": 3.842342529296875,
+      "step": 7500
+    },
+    {
+      "epoch": 0.06156724149158482,
+      "grad_norm": 1.7583329677581787,
+      "learning_rate": 0.00028836051127406197,
+      "loss": 3.8208109537760415,
+      "step": 7800
+    },
+    {
+      "epoch": 0.06393521231818423,
+      "grad_norm": 1.393927812576294,
+      "learning_rate": 0.00028763289164126734,
+      "loss": 3.8070406087239586,
+      "step": 8100
+    },
+    {
+      "epoch": 0.06630318314478366,
+      "grad_norm": 1.5146640539169312,
+      "learning_rate": 0.0002869052720084727,
+      "loss": 3.790522054036458,
+      "step": 8400
+    },
+    {
+      "epoch": 0.06867115397138307,
+      "grad_norm": 1.4215984344482422,
+      "learning_rate": 0.0002861776523756781,
+      "loss": 3.776350911458333,
+      "step": 8700
+    },
+    {
+      "epoch": 0.0710391247979825,
+      "grad_norm": 1.231087565422058,
+      "learning_rate": 0.00028545003274288346,
+      "loss": 3.7632710774739584,
+      "step": 9000
+    },
+    {
+      "epoch": 0.0710391247979825,
+      "eval_loss": 3.7583487033843994,
+      "eval_runtime": 718.1397,
+      "eval_samples_per_second": 127.337,
+      "eval_steps_per_second": 15.918,
+      "step": 9000
+    },
+    {
+      "epoch": 0.0734070956245819,
+      "grad_norm": 1.6221380233764648,
+      "learning_rate": 0.0002847224131100888,
+      "loss": 3.7516385904947915,
+      "step": 9300
+    },
+    {
+      "epoch": 0.07577506645118132,
+      "grad_norm": 1.1648283004760742,
+      "learning_rate": 0.0002839947934772942,
+      "loss": 3.740259195963542,
+      "step": 9600
+    },
+    {
+      "epoch": 0.07814303727778074,
+      "grad_norm": 1.3073171377182007,
+      "learning_rate": 0.0002832671738444996,
+      "loss": 3.7312325032552085,
+      "step": 9900
+    },
+    {
+      "epoch": 0.08051100810438015,
+      "grad_norm": 1.162903904914856,
+      "learning_rate": 0.00028253955421170496,
+      "loss": 3.7197379557291668,
+      "step": 10200
+    },
+    {
+      "epoch": 0.08287897893097958,
+      "grad_norm": 1.0998663902282715,
+      "learning_rate": 0.00028181193457891033,
+      "loss": 3.7105806477864585,
+      "step": 10500
+    },
+    {
+      "epoch": 0.08524694975757899,
+      "grad_norm": 1.336269736289978,
+      "learning_rate": 0.0002810843149461157,
+      "loss": 3.70353759765625,
+      "step": 10800
+    },
+    {
+      "epoch": 0.0876149205841784,
+      "grad_norm": 1.2086349725723267,
+      "learning_rate": 0.0002803566953133211,
+      "loss": 3.69432861328125,
+      "step": 11100
+    },
+    {
+      "epoch": 0.08998289141077782,
+      "grad_norm": 1.254562497138977,
+      "learning_rate": 0.00027962907568052645,
+      "loss": 3.6888834635416665,
+      "step": 11400
+    },
+    {
+      "epoch": 0.09235086223737723,
+      "grad_norm": 1.2221864461898804,
+      "learning_rate": 0.0002789014560477318,
+      "loss": 3.678084716796875,
+      "step": 11700
+    },
+    {
+      "epoch": 0.09471883306397665,
+      "grad_norm": 1.1677751541137695,
+      "learning_rate": 0.0002781738364149372,
+      "loss": 3.6715397135416667,
+      "step": 12000
+    },
+    {
+      "epoch": 0.09471883306397665,
+      "eval_loss": 3.6702311038970947,
+      "eval_runtime": 715.0294,
+      "eval_samples_per_second": 127.891,
+      "eval_steps_per_second": 15.987,
+      "step": 12000
+    },
+    {
+      "epoch": 0.09708680389057607,
+      "grad_norm": 1.117470145225525,
+      "learning_rate": 0.00027744621678214257,
+      "loss": 3.6664725748697915,
+      "step": 12300
+    },
+    {
+      "epoch": 0.09945477471717548,
+      "grad_norm": 1.2815014123916626,
+      "learning_rate": 0.00027671859714934794,
+      "loss": 3.661252034505208,
+      "step": 12600
+    },
+    {
+      "epoch": 0.1018227455437749,
+      "grad_norm": 1.0856040716171265,
+      "learning_rate": 0.0002759909775165533,
+      "loss": 3.6564200846354167,
+      "step": 12900
+    },
+    {
+      "epoch": 0.10419071637037432,
+      "grad_norm": 1.1769014596939087,
+      "learning_rate": 0.0002752633578837587,
+      "loss": 3.649186197916667,
+      "step": 13200
+    },
+    {
+      "epoch": 0.10655868719697373,
+      "grad_norm": 1.3859938383102417,
+      "learning_rate": 0.00027453573825096406,
+      "loss": 3.643751627604167,
+      "step": 13500
+    },
+    {
+      "epoch": 0.10892665802357315,
+      "grad_norm": 1.8141573667526245,
+      "learning_rate": 0.00027380811861816944,
+      "loss": 3.637841796875,
+      "step": 13800
+    },
+    {
+      "epoch": 0.11129462885017256,
+      "grad_norm": 1.0778776407241821,
+      "learning_rate": 0.0002730804989853748,
+      "loss": 3.6337967936197915,
+      "step": 14100
+    },
+    {
+      "epoch": 0.11366259967677199,
+      "grad_norm": 1.0354928970336914,
+      "learning_rate": 0.0002723528793525802,
+      "loss": 3.6280550130208336,
+      "step": 14400
+    },
+    {
+      "epoch": 0.1160305705033714,
+      "grad_norm": 1.0805619955062866,
+      "learning_rate": 0.00027162525971978556,
+      "loss": 3.624861653645833,
+      "step": 14700
+    },
+    {
+      "epoch": 0.11839854132997081,
+      "grad_norm": 1.2263294458389282,
+      "learning_rate": 0.00027089764008699093,
+      "loss": 3.619776611328125,
+      "step": 15000
+    },
+    {
+      "epoch": 0.11839854132997081,
+      "eval_loss": 3.6203835010528564,
+      "eval_runtime": 715.5873,
+      "eval_samples_per_second": 127.792,
+      "eval_steps_per_second": 15.974,
+      "step": 15000
+    },
+    {
+      "epoch": 0.12076651215657024,
+      "grad_norm": 0.9841662645339966,
+      "learning_rate": 0.00027017002045419636,
+      "loss": 3.615108642578125,
+      "step": 15300
+    },
+    {
+      "epoch": 0.12313448298316965,
+      "grad_norm": 1.1377567052841187,
+      "learning_rate": 0.0002694424008214017,
+      "loss": 3.6123758951822915,
+      "step": 15600
+    },
+    {
+      "epoch": 0.12550245380976907,
+      "grad_norm": 1.1826812028884888,
+      "learning_rate": 0.00026871478118860705,
+      "loss": 3.607783203125,
+      "step": 15900
+    },
+    {
+      "epoch": 0.12787042463636847,
+      "grad_norm": 1.0089523792266846,
+      "learning_rate": 0.0002679871615558124,
+      "loss": 3.605452067057292,
+      "step": 16200
+    },
+    {
+      "epoch": 0.1302383954629679,
+      "grad_norm": 1.0800347328186035,
+      "learning_rate": 0.0002672595419230178,
+      "loss": 3.6014888509114584,
+      "step": 16500
+    },
+    {
+      "epoch": 0.13260636628956732,
+      "grad_norm": 1.3550310134887695,
+      "learning_rate": 0.0002665319222902232,
+      "loss": 3.599439697265625,
+      "step": 16800
+    },
+    {
+      "epoch": 0.13497433711616674,
+      "grad_norm": 1.1879329681396484,
+      "learning_rate": 0.00026580430265742854,
+      "loss": 3.59224609375,
+      "step": 17100
+    },
+    {
+      "epoch": 0.13734230794276614,
+      "grad_norm": 0.9711620211601257,
+      "learning_rate": 0.00026507668302463397,
+      "loss": 3.590773111979167,
+      "step": 17400
+    },
+    {
+      "epoch": 0.13971027876936556,
+      "grad_norm": 1.348230004310608,
+      "learning_rate": 0.0002643490633918393,
+      "loss": 3.586073404947917,
+      "step": 17700
+    },
+    {
+      "epoch": 0.142078249595965,
+      "grad_norm": 1.338919758796692,
+      "learning_rate": 0.0002636214437590447,
+      "loss": 3.5837748209635416,
+      "step": 18000
+    },
+    {
+      "epoch": 0.142078249595965,
+      "eval_loss": 3.585731029510498,
+      "eval_runtime": 716.1146,
+      "eval_samples_per_second": 127.697,
+      "eval_steps_per_second": 15.963,
+      "step": 18000
+    },
+    {
+      "epoch": 0.1444462204225644,
+      "grad_norm": 1.079649806022644,
+      "learning_rate": 0.00026289382412625004,
+      "loss": 3.5820625813802085,
+      "step": 18300
+    },
+    {
+      "epoch": 0.1468141912491638,
+      "grad_norm": 0.9815412759780884,
+      "learning_rate": 0.00026216620449345546,
+      "loss": 3.579681396484375,
+      "step": 18600
+    },
+    {
+      "epoch": 0.14918216207576324,
+      "grad_norm": 0.9551495313644409,
+      "learning_rate": 0.00026143858486066084,
+      "loss": 3.5745731608072915,
+      "step": 18900
+    },
+    {
+      "epoch": 0.15155013290236263,
+      "grad_norm": 1.2238168716430664,
+      "learning_rate": 0.0002607109652278662,
+      "loss": 3.57164306640625,
+      "step": 19200
+    },
+    {
+      "epoch": 0.15391810372896206,
+      "grad_norm": 1.0777201652526855,
+      "learning_rate": 0.0002599833455950716,
+      "loss": 3.570758056640625,
+      "step": 19500
+    },
+    {
+      "epoch": 0.15628607455556148,
+      "grad_norm": 1.0027897357940674,
+      "learning_rate": 0.0002592557259622769,
+      "loss": 3.5670182291666666,
+      "step": 19800
+    },
+    {
+      "epoch": 0.15865404538216088,
+      "grad_norm": 0.969893217086792,
+      "learning_rate": 0.00025852810632948233,
+      "loss": 3.563897705078125,
+      "step": 20100
+    },
+    {
+      "epoch": 0.1610220162087603,
+      "grad_norm": 1.1204485893249512,
+      "learning_rate": 0.0002578004866966877,
+      "loss": 3.5608902994791665,
+      "step": 20400
+    },
+    {
+      "epoch": 0.16338998703535973,
+      "grad_norm": 1.3880882263183594,
+      "learning_rate": 0.0002570728670638931,
+      "loss": 3.559180908203125,
+      "step": 20700
+    },
+    {
+      "epoch": 0.16575795786195915,
+      "grad_norm": 1.1501843929290771,
+      "learning_rate": 0.00025634524743109845,
+      "loss": 3.5571744791666666,
+      "step": 21000
+    },
+    {
+      "epoch": 0.16575795786195915,
+      "eval_loss": 3.557675361633301,
+      "eval_runtime": 717.4951,
+      "eval_samples_per_second": 127.452,
+      "eval_steps_per_second": 15.932,
+      "step": 21000
+    },
+    {
+      "epoch": 0.16812592868855855,
+      "grad_norm": 1.0499974489212036,
+      "learning_rate": 0.0002556176277983038,
+      "loss": 3.556209716796875,
+      "step": 21300
+    },
+    {
+      "epoch": 0.17049389951515798,
+      "grad_norm": 1.0525528192520142,
+      "learning_rate": 0.0002548900081655092,
+      "loss": 3.552860107421875,
+      "step": 21600
+    },
+    {
+      "epoch": 0.1728618703417574,
+      "grad_norm": 1.0933493375778198,
+      "learning_rate": 0.00025416238853271457,
+      "loss": 3.5523689778645835,
+      "step": 21900
+    },
+    {
+      "epoch": 0.1752298411683568,
+      "grad_norm": 1.2025096416473389,
+      "learning_rate": 0.00025343476889991994,
+      "loss": 3.5478466796875,
+      "step": 22200
+    },
+    {
+      "epoch": 0.17759781199495622,
+      "grad_norm": 1.0625479221343994,
+      "learning_rate": 0.0002527071492671253,
+      "loss": 3.5459244791666666,
+      "step": 22500
+    },
+    {
+      "epoch": 0.17996578282155565,
+      "grad_norm": 0.9771813154220581,
+      "learning_rate": 0.0002519795296343307,
+      "loss": 3.5437410481770835,
+      "step": 22800
+    },
+    {
+      "epoch": 0.18233375364815504,
+      "grad_norm": 1.0128353834152222,
+      "learning_rate": 0.00025125191000153606,
+      "loss": 3.5427360026041668,
+      "step": 23100
+    },
+    {
+      "epoch": 0.18470172447475447,
+      "grad_norm": 1.0560641288757324,
+      "learning_rate": 0.00025052429036874144,
+      "loss": 3.5412748209635416,
+      "step": 23400
+    },
+    {
+      "epoch": 0.1870696953013539,
+      "grad_norm": 0.9452905058860779,
+      "learning_rate": 0.0002497966707359468,
+      "loss": 3.5386356608072917,
+      "step": 23700
+    },
+    {
+      "epoch": 0.1894376661279533,
+      "grad_norm": 1.0137649774551392,
+      "learning_rate": 0.0002490690511031522,
+      "loss": 3.53781005859375,
+      "step": 24000
+    },
+    {
+      "epoch": 0.1894376661279533,
+      "eval_loss": 3.536222457885742,
+      "eval_runtime": 796.2907,
+      "eval_samples_per_second": 114.84,
+      "eval_steps_per_second": 14.355,
+      "step": 24000
+    },
+    {
+      "epoch": 0.19180563695455272,
+      "grad_norm": 0.996288537979126,
+      "learning_rate": 0.00024834143147035756,
+      "loss": 3.5338720703125,
+      "step": 24300
+    },
+    {
+      "epoch": 0.19417360778115214,
+      "grad_norm": 1.0632129907608032,
+      "learning_rate": 0.00024761381183756293,
+      "loss": 3.53280517578125,
+      "step": 24600
+    },
+    {
+      "epoch": 0.19654157860775157,
+      "grad_norm": 1.082140564918518,
+      "learning_rate": 0.0002468861922047683,
+      "loss": 3.5305131022135416,
+      "step": 24900
+    },
+    {
+      "epoch": 0.19890954943435096,
+      "grad_norm": 0.9779849648475647,
+      "learning_rate": 0.0002461585725719737,
+      "loss": 3.5306551106770834,
+      "step": 25200
+    },
+    {
+      "epoch": 0.2012775202609504,
+      "grad_norm": 1.0490564107894897,
+      "learning_rate": 0.00024543095293917905,
+      "loss": 3.5283394368489582,
+      "step": 25500
+    },
+    {
+      "epoch": 0.2036454910875498,
+      "grad_norm": 0.9798095226287842,
+      "learning_rate": 0.0002447033333063845,
+      "loss": 3.526612548828125,
+      "step": 25800
+    },
+    {
+      "epoch": 0.2060134619141492,
+      "grad_norm": 1.0423043966293335,
+      "learning_rate": 0.00024397571367358982,
+      "loss": 3.523155517578125,
+      "step": 26100
+    },
+    {
+      "epoch": 0.20838143274074863,
+      "grad_norm": 1.0529029369354248,
+      "learning_rate": 0.00024324809404079517,
+      "loss": 3.523507080078125,
+      "step": 26400
+    },
+    {
+      "epoch": 0.21074940356734806,
+      "grad_norm": 1.0267666578292847,
+      "learning_rate": 0.00024252047440800057,
+      "loss": 3.5207275390625,
+      "step": 26700
+    },
+    {
+      "epoch": 0.21311737439394746,
+      "grad_norm": 1.0359541177749634,
+      "learning_rate": 0.00024179285477520592,
+      "loss": 3.5200341796875,
+      "step": 27000
+    },
+    {
+      "epoch": 0.21311737439394746,
+      "eval_loss": 3.520230770111084,
+      "eval_runtime": 870.6716,
+      "eval_samples_per_second": 105.029,
+      "eval_steps_per_second": 13.129,
+      "step": 27000
+    },
+    {
+      "epoch": 0.21548534522054688,
+      "grad_norm": 1.037380576133728,
+      "learning_rate": 0.00024106523514241132,
+      "loss": 3.5190767415364586,
+      "step": 27300
+    },
+    {
+      "epoch": 0.2178533160471463,
+      "grad_norm": 0.9903006553649902,
+      "learning_rate": 0.00024033761550961666,
+      "loss": 3.5154398600260417,
+      "step": 27600
+    },
+    {
+      "epoch": 0.2202212868737457,
+      "grad_norm": 0.9548389911651611,
+      "learning_rate": 0.00023960999587682206,
+      "loss": 3.514188232421875,
+      "step": 27900
+    },
+    {
+      "epoch": 0.22258925770034513,
+      "grad_norm": 0.9166722297668457,
+      "learning_rate": 0.00023888237624402744,
+      "loss": 3.512381998697917,
+      "step": 28200
+    },
+    {
+      "epoch": 0.22495722852694455,
+      "grad_norm": 1.058884620666504,
+      "learning_rate": 0.0002381547566112328,
+      "loss": 3.51379638671875,
+      "step": 28500
+    },
+    {
+      "epoch": 0.22732519935354398,
+      "grad_norm": 0.9274208545684814,
+      "learning_rate": 0.00023742713697843818,
+      "loss": 3.511065673828125,
+      "step": 28800
+    },
+    {
+      "epoch": 0.22969317018014337,
+      "grad_norm": 0.90445876121521,
+      "learning_rate": 0.00023669951734564358,
+      "loss": 3.508526611328125,
+      "step": 29100
+    },
+    {
+      "epoch": 0.2320611410067428,
+      "grad_norm": 0.9827880859375,
+      "learning_rate": 0.00023597189771284893,
+      "loss": 3.50707275390625,
+      "step": 29400
+    },
+    {
+      "epoch": 0.23442911183334222,
+      "grad_norm": 0.9515128135681152,
+      "learning_rate": 0.0002352442780800543,
+      "loss": 3.507930908203125,
+      "step": 29700
+    },
+    {
+      "epoch": 0.23679708265994162,
+      "grad_norm": 0.9174400568008423,
+      "learning_rate": 0.00023451665844725968,
+      "loss": 3.5041459147135416,
+      "step": 30000
+    },
+    {
+      "epoch": 0.23679708265994162,
+      "eval_loss": 3.5068986415863037,
+      "eval_runtime": 838.5585,
+      "eval_samples_per_second": 109.051,
+      "eval_steps_per_second": 13.632,
+      "step": 30000
+    },
+    {
+      "epoch": 0.23916505348654105,
+      "grad_norm": 1.100203037261963,
+      "learning_rate": 0.00023378903881446505,
+      "loss": 3.5045340983072917,
+      "step": 30300
+    },
+    {
+      "epoch": 0.24153302431314047,
+      "grad_norm": 1.1162805557250977,
+      "learning_rate": 0.00023306141918167045,
+      "loss": 3.5026436360677082,
+      "step": 30600
+    },
+    {
+      "epoch": 0.24390099513973987,
+      "grad_norm": 0.9859351515769958,
+      "learning_rate": 0.0002323337995488758,
+      "loss": 3.4984635416666667,
+      "step": 30900
+    },
+    {
+      "epoch": 0.2462689659663393,
+      "grad_norm": 1.0356827974319458,
+      "learning_rate": 0.0002316061799160812,
+      "loss": 3.49945068359375,
+      "step": 31200
+    },
+    {
+      "epoch": 0.24863693679293872,
+      "grad_norm": 0.9120193123817444,
+      "learning_rate": 0.00023087856028328654,
+      "loss": 3.4988773600260417,
+      "step": 31500
+    },
+    {
+      "epoch": 0.25100490761953814,
+      "grad_norm": 0.9607883095741272,
+      "learning_rate": 0.00023015094065049194,
+      "loss": 3.4984765625,
+      "step": 31800
+    },
+    {
+      "epoch": 0.25337287844613754,
+      "grad_norm": 0.9375991225242615,
+      "learning_rate": 0.00022942332101769732,
+      "loss": 3.4940218098958336,
+      "step": 32100
+    },
+    {
+      "epoch": 0.25574084927273694,
+      "grad_norm": 0.9308803677558899,
+      "learning_rate": 0.0002286957013849027,
+      "loss": 3.4968701171875,
+      "step": 32400
+    },
+    {
+      "epoch": 0.2581088200993364,
+      "grad_norm": 0.9001510143280029,
+      "learning_rate": 0.00022796808175210806,
+      "loss": 3.493405354817708,
+      "step": 32700
+    },
+    {
+      "epoch": 0.2604767909259358,
+      "grad_norm": 0.9320982098579407,
+      "learning_rate": 0.0002272404621193134,
+      "loss": 3.4938972981770835,
+      "step": 33000
+    },
+    {
+      "epoch": 0.2604767909259358,
+      "eval_loss": 3.4941368103027344,
+      "eval_runtime": 780.2849,
+      "eval_samples_per_second": 117.196,
+      "eval_steps_per_second": 14.65,
+      "step": 33000
+    },
+    {
+      "epoch": 0.2628447617525352,
+      "grad_norm": 0.8956986665725708,
+      "learning_rate": 0.0002265128424865188,
+      "loss": 3.4921891276041666,
+      "step": 33300
+    },
+    {
+      "epoch": 0.26521273257913464,
+      "grad_norm": 1.0946784019470215,
+      "learning_rate": 0.00022578522285372416,
+      "loss": 3.4908650716145835,
+      "step": 33600
+    },
+    {
+      "epoch": 0.26758070340573403,
+      "grad_norm": 0.9773956537246704,
+      "learning_rate": 0.00022505760322092956,
+      "loss": 3.490456136067708,
+      "step": 33900
+    },
+    {
+      "epoch": 0.2699486742323335,
+      "grad_norm": 0.9223257899284363,
+      "learning_rate": 0.00022432998358813493,
+      "loss": 3.4880021158854166,
+      "step": 34200
+    },
+    {
+      "epoch": 0.2723166450589329,
+      "grad_norm": 0.968156635761261,
+      "learning_rate": 0.0002236023639553403,
+      "loss": 3.488686116536458,
+      "step": 34500
+    },
+    {
+      "epoch": 0.2746846158855323,
+      "grad_norm": 0.936249315738678,
+      "learning_rate": 0.00022287474432254568,
+      "loss": 3.486342366536458,
+      "step": 34800
+    },
+    {
+      "epoch": 0.27705258671213173,
+      "grad_norm": 0.9775978326797485,
+      "learning_rate": 0.00022214712468975108,
+      "loss": 3.485010986328125,
+      "step": 35100
+    },
+    {
+      "epoch": 0.27942055753873113,
+      "grad_norm": 0.8574739694595337,
+      "learning_rate": 0.00022141950505695642,
+      "loss": 3.485606689453125,
+      "step": 35400
+    },
+    {
+      "epoch": 0.2817885283653305,
+      "grad_norm": 1.0943197011947632,
+      "learning_rate": 0.00022069188542416182,
+      "loss": 3.4831343587239583,
+      "step": 35700
+    },
+    {
+      "epoch": 0.28415649919193,
+      "grad_norm": 1.110596776008606,
+      "learning_rate": 0.00021996426579136717,
+      "loss": 3.482994384765625,
+      "step": 36000
+    },
+    {
+      "epoch": 0.28415649919193,
+      "eval_loss": 3.483867883682251,
+      "eval_runtime": 808.3329,
+      "eval_samples_per_second": 113.129,
+      "eval_steps_per_second": 14.141,
+      "step": 36000
+    },
+    {
+      "epoch": 0.2865244700185294,
+      "grad_norm": 0.9997596740722656,
+      "learning_rate": 0.00021923664615857254,
+      "loss": 3.4822794596354165,
+      "step": 36300
+    },
+    {
+      "epoch": 0.2888924408451288,
+      "grad_norm": 1.0619690418243408,
+      "learning_rate": 0.00021850902652577794,
+      "loss": 3.4802823893229164,
+      "step": 36600
+    },
+    {
+      "epoch": 0.2912604116717282,
+      "grad_norm": 0.8717644214630127,
+      "learning_rate": 0.0002177814068929833,
+      "loss": 3.4801631673177083,
+      "step": 36900
+    },
+    {
+      "epoch": 0.2936283824983276,
+      "grad_norm": 1.0067319869995117,
+      "learning_rate": 0.0002170537872601887,
+      "loss": 3.4796492513020834,
+      "step": 37200
+    },
+    {
+      "epoch": 0.295996353324927,
+      "grad_norm": 0.9514286518096924,
+      "learning_rate": 0.00021632616762739404,
+      "loss": 3.4773624674479167,
+      "step": 37500
+    },
+    {
+      "epoch": 0.29836432415152647,
+      "grad_norm": 0.926723837852478,
+      "learning_rate": 0.00021559854799459944,
+      "loss": 3.478388671875,
+      "step": 37800
+    },
+    {
+      "epoch": 0.30073229497812587,
+      "grad_norm": 0.8748102188110352,
+      "learning_rate": 0.00021487092836180478,
+      "loss": 3.475211181640625,
+      "step": 38100
+    },
+    {
+      "epoch": 0.30310026580472527,
+      "grad_norm": 1.0045989751815796,
+      "learning_rate": 0.00021414330872901018,
+      "loss": 3.473729248046875,
+      "step": 38400
+    },
+    {
+      "epoch": 0.3054682366313247,
+      "grad_norm": 0.8651835918426514,
+      "learning_rate": 0.00021341568909621556,
+      "loss": 3.4740470377604167,
+      "step": 38700
+    },
+    {
+      "epoch": 0.3078362074579241,
+      "grad_norm": 0.8830553889274597,
+      "learning_rate": 0.00021268806946342093,
+      "loss": 3.4751204427083335,
+      "step": 39000
+    },
+    {
+      "epoch": 0.3078362074579241,
+      "eval_loss": 3.4747302532196045,
+      "eval_runtime": 730.2167,
+      "eval_samples_per_second": 125.231,
+      "eval_steps_per_second": 15.654,
+      "step": 39000
+    },
+    {
+      "epoch": 0.3102041782845235,
+      "grad_norm": 0.9464964270591736,
+      "learning_rate": 0.0002119604498306263,
+      "loss": 3.4746061197916664,
+      "step": 39300
+    },
+    {
+      "epoch": 0.31257214911112297,
+      "grad_norm": 0.9324625730514526,
+      "learning_rate": 0.00021123283019783165,
+      "loss": 3.470994059244792,
+      "step": 39600
+    },
+    {
+      "epoch": 0.31494011993772236,
+      "grad_norm": 1.1780554056167603,
+      "learning_rate": 0.00021050521056503705,
+      "loss": 3.4727888997395833,
+      "step": 39900
+    },
+    {
+      "epoch": 0.31730809076432176,
+      "grad_norm": 0.9041585922241211,
+      "learning_rate": 0.00020977759093224242,
+      "loss": 3.468409016927083,
+      "step": 40200
+    },
+    {
+      "epoch": 0.3196760615909212,
+      "grad_norm": 0.9092329144477844,
+      "learning_rate": 0.0002090499712994478,
+      "loss": 3.469168701171875,
+      "step": 40500
+    },
+    {
+      "epoch": 0.3220440324175206,
+      "grad_norm": 0.910569965839386,
+      "learning_rate": 0.00020832235166665317,
+      "loss": 3.4692765299479165,
+      "step": 40800
+    },
+    {
+      "epoch": 0.32441200324412,
+      "grad_norm": 0.9507219791412354,
+      "learning_rate": 0.00020759473203385857,
+      "loss": 3.468035888671875,
+      "step": 41100
+    },
+    {
+      "epoch": 0.32677997407071946,
+      "grad_norm": 0.8926546573638916,
+      "learning_rate": 0.00020686711240106392,
+      "loss": 3.4669425455729166,
+      "step": 41400
+    },
+    {
+      "epoch": 0.32914794489731886,
+      "grad_norm": 0.9421713948249817,
+      "learning_rate": 0.00020613949276826932,
+      "loss": 3.46578369140625,
+      "step": 41700
+    },
+    {
+      "epoch": 0.3315159157239183,
+      "grad_norm": 0.9552275538444519,
+      "learning_rate": 0.00020541187313547466,
+      "loss": 3.465254313151042,
+      "step": 42000
+    },
+    {
+      "epoch": 0.3315159157239183,
+      "eval_loss": 3.4675979614257812,
+      "eval_runtime": 746.4636,
+      "eval_samples_per_second": 122.506,
+      "eval_steps_per_second": 15.314,
+      "step": 42000
+    },
+    {
+      "epoch": 0.3338838865505177,
+      "grad_norm": 0.8721812963485718,
+      "learning_rate": 0.00020468425350268006,
+      "loss": 3.4658219401041666,
+      "step": 42300
+    },
+    {
+      "epoch": 0.3362518573771171,
+      "grad_norm": 0.9490671157836914,
+      "learning_rate": 0.0002039566338698854,
+      "loss": 3.463648681640625,
+      "step": 42600
+    },
+    {
+      "epoch": 0.33861982820371656,
+      "grad_norm": 0.9442757964134216,
+      "learning_rate": 0.00020322901423709078,
+      "loss": 3.463751220703125,
+      "step": 42900
+    },
+    {
+      "epoch": 0.34098779903031595,
+      "grad_norm": 0.9976273775100708,
+      "learning_rate": 0.00020250139460429618,
+      "loss": 3.4618192545572914,
+      "step": 43200
+    },
+    {
+      "epoch": 0.34335576985691535,
+      "grad_norm": 0.9240713715553284,
+      "learning_rate": 0.00020177377497150153,
+      "loss": 3.46244873046875,
+      "step": 43500
+    },
+    {
+      "epoch": 0.3457237406835148,
+      "grad_norm": 0.908048689365387,
+      "learning_rate": 0.00020104615533870693,
+      "loss": 3.461156819661458,
+      "step": 43800
+    },
+    {
+      "epoch": 0.3480917115101142,
+      "grad_norm": 0.9948372840881348,
+      "learning_rate": 0.00020031853570591228,
+      "loss": 3.4602978515625,
+      "step": 44100
+    },
+    {
+      "epoch": 0.3504596823367136,
+      "grad_norm": 0.9128249883651733,
+      "learning_rate": 0.00019959091607311768,
+      "loss": 3.45984130859375,
+      "step": 44400
+    },
+    {
+      "epoch": 0.35282765316331305,
+      "grad_norm": 0.9716851115226746,
+      "learning_rate": 0.00019886329644032305,
+      "loss": 3.4610965983072917,
+      "step": 44700
+    },
+    {
+      "epoch": 0.35519562398991245,
+      "grad_norm": 0.8809486031532288,
+      "learning_rate": 0.00019813567680752842,
+      "loss": 3.459405924479167,
+      "step": 45000
+    },
+    {
+      "epoch": 0.35519562398991245,
+      "eval_loss": 3.459148645401001,
+      "eval_runtime": 746.3606,
+      "eval_samples_per_second": 122.523,
+      "eval_steps_per_second": 15.316,
+      "step": 45000
+    },
+    {
+      "epoch": 0.35756359481651184,
+      "grad_norm": 0.9781611561775208,
+      "learning_rate": 0.0001974080571747338,
+      "loss": 3.457574462890625,
+      "step": 45300
+    },
+    {
+      "epoch": 0.3599315656431113,
+      "grad_norm": 0.9035600423812866,
+      "learning_rate": 0.0001966804375419392,
+      "loss": 3.4560428873697915,
+      "step": 45600
+    },
+    {
+      "epoch": 0.3622995364697107,
+      "grad_norm": 0.9381418228149414,
+      "learning_rate": 0.00019595281790914454,
+      "loss": 3.4561177571614583,
+      "step": 45900
+    },
+    {
+      "epoch": 0.3646675072963101,
+      "grad_norm": 0.895790696144104,
+      "learning_rate": 0.00019522519827634992,
+      "loss": 3.45467041015625,
+      "step": 46200
+    },
+    {
+      "epoch": 0.36703547812290954,
+      "grad_norm": 0.9542234539985657,
+      "learning_rate": 0.0001944975786435553,
+      "loss": 3.4569083658854165,
+      "step": 46500
+    },
+    {
+      "epoch": 0.36940344894950894,
+      "grad_norm": 0.9805415868759155,
+      "learning_rate": 0.00019376995901076066,
+      "loss": 3.455458984375,
+      "step": 46800
+    },
+    {
+      "epoch": 0.37177141977610834,
+      "grad_norm": 0.9199254512786865,
+      "learning_rate": 0.00019304233937796604,
+      "loss": 3.4538167317708335,
+      "step": 47100
+    },
+    {
+      "epoch": 0.3741393906027078,
+      "grad_norm": 1.0180085897445679,
+      "learning_rate": 0.0001923147197451714,
+      "loss": 3.453374430338542,
+      "step": 47400
+    },
+    {
+      "epoch": 0.3765073614293072,
+      "grad_norm": 0.9545879364013672,
+      "learning_rate": 0.0001915871001123768,
+      "loss": 3.4519278971354166,
+      "step": 47700
+    },
+    {
+      "epoch": 0.3788753322559066,
+      "grad_norm": 0.9383215308189392,
+      "learning_rate": 0.00019085948047958216,
+      "loss": 3.4520377604166668,
+      "step": 48000
+    },
+    {
+      "epoch": 0.3788753322559066,
+      "eval_loss": 3.453564405441284,
+      "eval_runtime": 745.4086,
+      "eval_samples_per_second": 122.679,
+      "eval_steps_per_second": 15.335,
+      "step": 48000
+    },
+    {
+      "epoch": 0.38124330308250604,
+      "grad_norm": 0.8732979893684387,
+      "learning_rate": 0.00019013186084678756,
+      "loss": 3.452303873697917,
+      "step": 48300
+    },
+    {
+      "epoch": 0.38361127390910543,
+      "grad_norm": 0.8917658925056458,
+      "learning_rate": 0.0001894042412139929,
+      "loss": 3.45164794921875,
+      "step": 48600
+    },
+    {
+      "epoch": 0.38597924473570483,
+      "grad_norm": 0.8956992626190186,
+      "learning_rate": 0.0001886766215811983,
+      "loss": 3.450028483072917,
+      "step": 48900
+    },
+    {
+      "epoch": 0.3883472155623043,
+      "grad_norm": 1.021035075187683,
+      "learning_rate": 0.00018794900194840368,
+      "loss": 3.450257568359375,
+      "step": 49200
+    },
+    {
+      "epoch": 0.3907151863889037,
+      "grad_norm": 0.9055272936820984,
+      "learning_rate": 0.00018722138231560902,
+      "loss": 3.4502669270833333,
+      "step": 49500
+    },
+    {
+      "epoch": 0.39308315721550313,
+      "grad_norm": 0.9082198739051819,
+      "learning_rate": 0.00018649376268281442,
+      "loss": 3.4501778157552083,
+      "step": 49800
+    },
+    {
+      "epoch": 0.39545112804210253,
+      "grad_norm": 0.9271096587181091,
+      "learning_rate": 0.00018576614305001977,
+      "loss": 3.446944580078125,
+      "step": 50100
+    },
+    {
+      "epoch": 0.3978190988687019,
+      "grad_norm": 0.8688974380493164,
+      "learning_rate": 0.00018503852341722517,
+      "loss": 3.4467333984375,
+      "step": 50400
+    },
+    {
+      "epoch": 0.4001870696953014,
+      "grad_norm": 0.8836665749549866,
+      "learning_rate": 0.00018431090378443054,
+      "loss": 3.4481075032552084,
+      "step": 50700
+    },
+    {
+      "epoch": 0.4025550405219008,
+      "grad_norm": 0.936870276927948,
+      "learning_rate": 0.00018358328415163592,
+      "loss": 3.44533203125,
+      "step": 51000
+    },
+    {
+      "epoch": 0.4025550405219008,
+      "eval_loss": 3.447103261947632,
+      "eval_runtime": 744.2459,
+      "eval_samples_per_second": 122.871,
+      "eval_steps_per_second": 15.359,
+      "step": 51000
+    },
+    {
+      "epoch": 0.4049230113485002,
+      "grad_norm": 0.9273696541786194,
+      "learning_rate": 0.0001828556645188413,
+      "loss": 3.445096435546875,
+      "step": 51300
+    },
+    {
+      "epoch": 0.4072909821750996,
+      "grad_norm": 0.9366033673286438,
+      "learning_rate": 0.00018212804488604666,
+      "loss": 3.4445076497395832,
+      "step": 51600
+    },
+    {
+      "epoch": 0.409658953001699,
+      "grad_norm": 0.9472881555557251,
+      "learning_rate": 0.00018140042525325204,
+      "loss": 3.4447623697916665,
+      "step": 51900
+    },
+    {
+      "epoch": 0.4120269238282984,
+      "grad_norm": 0.8676178455352783,
+      "learning_rate": 0.00018067280562045744,
+      "loss": 3.44419189453125,
+      "step": 52200
+    },
+    {
+      "epoch": 0.41439489465489787,
+      "grad_norm": 0.9277909398078918,
+      "learning_rate": 0.00017994518598766278,
+      "loss": 3.444455159505208,
+      "step": 52500
+    },
+    {
+      "epoch": 0.41676286548149727,
+      "grad_norm": 0.9761715531349182,
+      "learning_rate": 0.00017921756635486816,
+      "loss": 3.4431758626302083,
+      "step": 52800
+    },
+    {
+      "epoch": 0.41913083630809667,
+      "grad_norm": 0.9198261499404907,
+      "learning_rate": 0.00017848994672207353,
+      "loss": 3.4412251790364583,
+      "step": 53100
+    },
+    {
+      "epoch": 0.4214988071346961,
+      "grad_norm": 0.937890350818634,
+      "learning_rate": 0.0001777623270892789,
+      "loss": 3.4420284016927085,
+      "step": 53400
+    },
+    {
+      "epoch": 0.4238667779612955,
+      "grad_norm": 1.0416877269744873,
+      "learning_rate": 0.0001770347074564843,
+      "loss": 3.440599365234375,
+      "step": 53700
+    },
+    {
+      "epoch": 0.4262347487878949,
+      "grad_norm": 0.9447240233421326,
+      "learning_rate": 0.00017630708782368965,
+      "loss": 3.4408907063802086,
+      "step": 54000
+    },
+    {
+      "epoch": 0.4262347487878949,
+      "eval_loss": 3.441429376602173,
+      "eval_runtime": 751.3255,
+      "eval_samples_per_second": 121.713,
+      "eval_steps_per_second": 15.214,
+      "step": 54000
+    },
+    {
+      "epoch": 0.42860271961449437,
+      "grad_norm": 0.9947392344474792,
+      "learning_rate": 0.00017557946819089505,
+      "loss": 3.440968017578125,
+      "step": 54300
+    },
+    {
+      "epoch": 0.43097069044109376,
+      "grad_norm": 0.9691703915596008,
+      "learning_rate": 0.0001748518485581004,
+      "loss": 3.4403125,
+      "step": 54600
+    },
+    {
+      "epoch": 0.43333866126769316,
+      "grad_norm": 0.8677510619163513,
+      "learning_rate": 0.0001741242289253058,
+      "loss": 3.4377701822916666,
+      "step": 54900
+    },
+    {
+      "epoch": 0.4357066320942926,
+      "grad_norm": 1.112776279449463,
+      "learning_rate": 0.00017339660929251117,
+      "loss": 3.438250732421875,
+      "step": 55200
+    },
+    {
+      "epoch": 0.438074602920892,
+      "grad_norm": 0.9026038646697998,
+      "learning_rate": 0.00017266898965971654,
+      "loss": 3.4367390950520833,
+      "step": 55500
+    },
+    {
+      "epoch": 0.4404425737474914,
+      "grad_norm": 0.9615198969841003,
+      "learning_rate": 0.00017194137002692192,
+      "loss": 3.436519775390625,
+      "step": 55800
+    },
+    {
+      "epoch": 0.44281054457409086,
+      "grad_norm": 0.9865854978561401,
+      "learning_rate": 0.0001712137503941273,
+      "loss": 3.4372578938802083,
+      "step": 56100
+    },
+    {
+      "epoch": 0.44517851540069026,
+      "grad_norm": 0.8490434288978577,
+      "learning_rate": 0.00017048613076133266,
+      "loss": 3.4357320149739583,
+      "step": 56400
+    },
+    {
+      "epoch": 0.44754648622728965,
+      "grad_norm": 1.0268882513046265,
+      "learning_rate": 0.000169758511128538,
+      "loss": 3.4376021321614583,
+      "step": 56700
+    },
+    {
+      "epoch": 0.4499144570538891,
+      "grad_norm": 0.8533521294593811,
+      "learning_rate": 0.0001690308914957434,
+      "loss": 3.436923828125,
+      "step": 57000
+    },
+    {
+      "epoch": 0.4499144570538891,
+      "eval_loss": 3.4373722076416016,
+      "eval_runtime": 805.2689,
+      "eval_samples_per_second": 113.56,
+      "eval_steps_per_second": 14.195,
+      "step": 57000
+    },
+    {
+      "epoch": 0.4522824278804885,
+      "grad_norm": 0.9340401291847229,
+      "learning_rate": 0.00016830327186294878,
+      "loss": 3.434686279296875,
+      "step": 57300
+    },
+    {
+      "epoch": 0.45465039870708795,
+      "grad_norm": 0.9090702533721924,
+      "learning_rate": 0.00016757565223015416,
+      "loss": 3.432115478515625,
+      "step": 57600
+    },
+    {
+      "epoch": 0.45701836953368735,
+      "grad_norm": 0.9301067590713501,
+      "learning_rate": 0.00016684803259735953,
+      "loss": 3.4344864908854165,
+      "step": 57900
+    },
+    {
+      "epoch": 0.45938634036028675,
+      "grad_norm": 0.9871979355812073,
+      "learning_rate": 0.00016612041296456493,
+      "loss": 3.4341312662760415,
+      "step": 58200
+    },
+    {
+      "epoch": 0.4617543111868862,
+      "grad_norm": 1.0220112800598145,
+      "learning_rate": 0.00016539279333177028,
+      "loss": 3.433006591796875,
+      "step": 58500
+    },
+    {
+      "epoch": 0.4641222820134856,
+      "grad_norm": 0.8772982358932495,
+      "learning_rate": 0.00016466517369897568,
+      "loss": 3.432755940755208,
+      "step": 58800
+    },
+    {
+      "epoch": 0.466490252840085,
+      "grad_norm": 0.8644952774047852,
+      "learning_rate": 0.00016393755406618102,
+      "loss": 3.433555908203125,
+      "step": 59100
+    },
+    {
+      "epoch": 0.46885822366668445,
+      "grad_norm": 0.8992140889167786,
+      "learning_rate": 0.00016320993443338642,
+      "loss": 3.4328841145833335,
+      "step": 59400
+    },
+    {
+      "epoch": 0.47122619449328385,
+      "grad_norm": 0.928390383720398,
+      "learning_rate": 0.0001624823148005918,
+      "loss": 3.4311100260416665,
+      "step": 59700
+    },
+    {
+      "epoch": 0.47359416531988324,
+      "grad_norm": 0.8753233551979065,
+      "learning_rate": 0.00016175469516779714,
+      "loss": 3.431182861328125,
+      "step": 60000
+    },
+    {
+      "epoch": 0.47359416531988324,
+      "eval_loss": 3.432565450668335,
+      "eval_runtime": 741.1721,
+      "eval_samples_per_second": 123.38,
+      "eval_steps_per_second": 15.423,
+      "step": 60000
+    },
+    {
+      "epoch": 0.4759621361464827,
+      "grad_norm": 0.9439966082572937,
+      "learning_rate": 0.00016102707553500254,
+      "loss": 3.431356201171875,
+      "step": 60300
+    },
+    {
+      "epoch": 0.4783301069730821,
+      "grad_norm": 0.9623426198959351,
+      "learning_rate": 0.0001602994559022079,
+      "loss": 3.4319246419270835,
+      "step": 60600
+    },
+    {
+      "epoch": 0.4806980777996815,
+      "grad_norm": 1.1217174530029297,
+      "learning_rate": 0.0001595718362694133,
+      "loss": 3.4286568196614584,
+      "step": 60900
+    },
+    {
+      "epoch": 0.48306604862628094,
+      "grad_norm": 0.9596153497695923,
+      "learning_rate": 0.00015884421663661864,
+      "loss": 3.4278682454427085,
+      "step": 61200
+    },
+    {
+      "epoch": 0.48543401945288034,
+      "grad_norm": 1.0474796295166016,
+      "learning_rate": 0.00015811659700382404,
+      "loss": 3.429913330078125,
+      "step": 61500
+    },
+    {
+      "epoch": 0.48780199027947974,
+      "grad_norm": 0.8793846368789673,
+      "learning_rate": 0.0001573889773710294,
+      "loss": 3.4282792154947916,
+      "step": 61800
+    },
+    {
+      "epoch": 0.4901699611060792,
+      "grad_norm": 0.9113200902938843,
+      "learning_rate": 0.00015666135773823478,
+      "loss": 3.4286865234375,
+      "step": 62100
+    },
+    {
+      "epoch": 0.4925379319326786,
+      "grad_norm": 0.9840859174728394,
+      "learning_rate": 0.00015593373810544016,
+      "loss": 3.4279158528645834,
+      "step": 62400
+    },
+    {
+      "epoch": 0.494905902759278,
+      "grad_norm": 0.9114407896995544,
+      "learning_rate": 0.00015520611847264556,
+      "loss": 3.4287504069010417,
+      "step": 62700
+    },
+    {
+      "epoch": 0.49727387358587744,
+      "grad_norm": 0.8448418378829956,
+      "learning_rate": 0.0001544784988398509,
+      "loss": 3.425767415364583,
+      "step": 63000
+    },
+    {
+      "epoch": 0.49727387358587744,
+      "eval_loss": 3.428375244140625,
+      "eval_runtime": 744.2897,
+      "eval_samples_per_second": 122.863,
+      "eval_steps_per_second": 15.358,
+      "step": 63000
+    },
+    {
+      "epoch": 0.49964184441247683,
+      "grad_norm": 0.9205408096313477,
+      "learning_rate": 0.00015375087920705628,
+      "loss": 3.4254256184895833,
+      "step": 63300
+    },
+    {
+      "epoch": 0.5020098152390763,
+      "grad_norm": 1.0273767709732056,
+      "learning_rate": 0.00015302325957426165,
+      "loss": 3.4268212890625,
+      "step": 63600
+    },
+    {
+      "epoch": 0.5043777860656756,
+      "grad_norm": 0.8491344451904297,
+      "learning_rate": 0.00015229563994146702,
+      "loss": 3.425308837890625,
+      "step": 63900
+    },
+    {
+      "epoch": 0.5067457568922751,
+      "grad_norm": 0.9568387269973755,
+      "learning_rate": 0.00015156802030867242,
+      "loss": 3.4257462565104166,
+      "step": 64200
+    },
+    {
+      "epoch": 0.5091137277188745,
+      "grad_norm": 0.912223756313324,
+      "learning_rate": 0.00015084040067587777,
+      "loss": 3.42352783203125,
+      "step": 64500
+    },
+    {
+      "epoch": 0.5114816985454739,
+      "grad_norm": 0.8855020403862,
+      "learning_rate": 0.00015011278104308317,
+      "loss": 3.42487060546875,
+      "step": 64800
+    },
+    {
+      "epoch": 0.5138496693720733,
+      "grad_norm": 0.9171079993247986,
+      "learning_rate": 0.00014938516141028852,
+      "loss": 3.425098876953125,
+      "step": 65100
+    },
+    {
+      "epoch": 0.5162176401986728,
+      "grad_norm": 0.9581006169319153,
+      "learning_rate": 0.0001486575417774939,
+      "loss": 3.422734781901042,
+      "step": 65400
+    },
+    {
+      "epoch": 0.5185856110252721,
+      "grad_norm": 0.9489786624908447,
+      "learning_rate": 0.00014792992214469926,
+      "loss": 3.4213761393229167,
+      "step": 65700
+    },
+    {
+      "epoch": 0.5209535818518716,
+      "grad_norm": 0.9567045569419861,
+      "learning_rate": 0.00014720230251190466,
+      "loss": 3.4233223470052083,
+      "step": 66000
+    },
+    {
+      "epoch": 0.5209535818518716,
+      "eval_loss": 3.423550844192505,
+      "eval_runtime": 747.1444,
+      "eval_samples_per_second": 122.394,
+      "eval_steps_per_second": 15.3,
+      "step": 66000
+    },
+    {
+      "epoch": 0.523321552678471,
+      "grad_norm": 0.9048585891723633,
+      "learning_rate": 0.00014647468287911004,
+      "loss": 3.421425374348958,
+      "step": 66300
+    },
+    {
+      "epoch": 0.5256895235050704,
+      "grad_norm": 0.8782548904418945,
+      "learning_rate": 0.0001457470632463154,
+      "loss": 3.4216788736979167,
+      "step": 66600
+    },
+    {
+      "epoch": 0.5280574943316698,
+      "grad_norm": 0.9210931658744812,
+      "learning_rate": 0.00014501944361352078,
+      "loss": 3.420052083333333,
+      "step": 66900
+    },
+    {
+      "epoch": 0.5304254651582693,
+      "grad_norm": 0.8783855438232422,
+      "learning_rate": 0.00014429182398072616,
+      "loss": 3.4197184244791665,
+      "step": 67200
+    },
+    {
+      "epoch": 0.5327934359848686,
+      "grad_norm": 0.9573942422866821,
+      "learning_rate": 0.00014356420434793153,
+      "loss": 3.4213602701822916,
+      "step": 67500
+    },
+    {
+      "epoch": 0.5351614068114681,
+      "grad_norm": 0.8869844079017639,
+      "learning_rate": 0.0001428365847151369,
+      "loss": 3.4201835123697917,
+      "step": 67800
+    },
+    {
+      "epoch": 0.5375293776380675,
+      "grad_norm": 0.9585769176483154,
+      "learning_rate": 0.00014210896508234228,
+      "loss": 3.420860595703125,
+      "step": 68100
+    },
+    {
+      "epoch": 0.539897348464667,
+      "grad_norm": 0.8971334099769592,
+      "learning_rate": 0.00014138134544954765,
+      "loss": 3.4193851725260416,
+      "step": 68400
+    },
+    {
+      "epoch": 0.5422653192912663,
+      "grad_norm": 0.9371477365493774,
+      "learning_rate": 0.00014065372581675302,
+      "loss": 3.4193025716145833,
+      "step": 68700
+    },
+    {
+      "epoch": 0.5446332901178658,
+      "grad_norm": 0.9081939458847046,
+      "learning_rate": 0.0001399261061839584,
+      "loss": 3.4187516276041667,
+      "step": 69000
+    },
+    {
+      "epoch": 0.5446332901178658,
+      "eval_loss": 3.4199793338775635,
+      "eval_runtime": 750.7467,
+      "eval_samples_per_second": 121.807,
+      "eval_steps_per_second": 15.226,
+      "step": 69000
+    },
+    {
+      "epoch": 0.5470012609444652,
+      "grad_norm": 0.9167564511299133,
+      "learning_rate": 0.00013919848655116377,
+      "loss": 3.419979248046875,
+      "step": 69300
+    },
+    {
+      "epoch": 0.5493692317710646,
+      "grad_norm": 0.9261924624443054,
+      "learning_rate": 0.00013847086691836914,
+      "loss": 3.4188728841145832,
+      "step": 69600
+    },
+    {
+      "epoch": 0.551737202597664,
+      "grad_norm": 0.9604618549346924,
+      "learning_rate": 0.00013774324728557452,
+      "loss": 3.4166548665364584,
+      "step": 69900
+    },
+    {
+      "epoch": 0.5541051734242635,
+      "grad_norm": 0.9066981673240662,
+      "learning_rate": 0.0001370156276527799,
+      "loss": 3.4174702962239585,
+      "step": 70200
+    },
+    {
+      "epoch": 0.5564731442508628,
+      "grad_norm": 0.9094845056533813,
+      "learning_rate": 0.0001362880080199853,
+      "loss": 3.417271728515625,
+      "step": 70500
+    },
+    {
+      "epoch": 0.5588411150774623,
+      "grad_norm": 0.8867021799087524,
+      "learning_rate": 0.00013556038838719066,
+      "loss": 3.4159663899739585,
+      "step": 70800
+    },
+    {
+      "epoch": 0.5612090859040617,
+      "grad_norm": 0.8904138207435608,
+      "learning_rate": 0.00013483276875439604,
+      "loss": 3.41653564453125,
+      "step": 71100
+    },
+    {
+      "epoch": 0.563577056730661,
+      "grad_norm": 0.9619826674461365,
+      "learning_rate": 0.00013410514912160138,
+      "loss": 3.417818603515625,
+      "step": 71400
+    },
+    {
+      "epoch": 0.5659450275572605,
+      "grad_norm": 0.9813979864120483,
+      "learning_rate": 0.00013337752948880676,
+      "loss": 3.4158150227864583,
+      "step": 71700
+    },
+    {
+      "epoch": 0.56831299838386,
+      "grad_norm": 0.946321427822113,
+      "learning_rate": 0.00013264990985601213,
+      "loss": 3.416580810546875,
+      "step": 72000
+    },
+    {
+      "epoch": 0.56831299838386,
+      "eval_loss": 3.4163966178894043,
+      "eval_runtime": 745.3651,
+      "eval_samples_per_second": 122.686,
+      "eval_steps_per_second": 15.336,
+      "step": 72000
+    },
+    {
+      "epoch": 0.5706809692104593,
+      "grad_norm": 0.9013136625289917,
+      "learning_rate": 0.00013192229022321753,
+      "loss": 3.414710693359375,
+      "step": 72300
+    },
+    {
+      "epoch": 0.5730489400370588,
+      "grad_norm": 0.8942293524742126,
+      "learning_rate": 0.0001311946705904229,
+      "loss": 3.4147599283854166,
+      "step": 72600
+    },
+    {
+      "epoch": 0.5754169108636582,
+      "grad_norm": 1.0226171016693115,
+      "learning_rate": 0.00013046705095762828,
+      "loss": 3.41486328125,
+      "step": 72900
+    },
+    {
+      "epoch": 0.5777848816902575,
+      "grad_norm": 0.9574885964393616,
+      "learning_rate": 0.00012973943132483365,
+      "loss": 3.4147432454427085,
+      "step": 73200
+    },
+    {
+      "epoch": 0.580152852516857,
+      "grad_norm": 0.932759165763855,
+      "learning_rate": 0.00012901181169203902,
+      "loss": 3.4122733561197918,
+      "step": 73500
+    },
+    {
+      "epoch": 0.5825208233434565,
+      "grad_norm": 0.8803161382675171,
+      "learning_rate": 0.0001282841920592444,
+      "loss": 3.414296875,
+      "step": 73800
+    },
+    {
+      "epoch": 0.5848887941700558,
+      "grad_norm": 1.021998643875122,
+      "learning_rate": 0.00012755657242644977,
+      "loss": 3.4131294759114583,
+      "step": 74100
+    },
+    {
+      "epoch": 0.5872567649966552,
+      "grad_norm": 0.968071460723877,
+      "learning_rate": 0.00012682895279365514,
+      "loss": 3.413003133138021,
+      "step": 74400
+    },
+    {
+      "epoch": 0.5896247358232547,
+      "grad_norm": 1.0821932554244995,
+      "learning_rate": 0.00012610133316086052,
+      "loss": 3.4129736328125,
+      "step": 74700
+    },
+    {
+      "epoch": 0.591992706649854,
+      "grad_norm": 0.8909016847610474,
+      "learning_rate": 0.0001253737135280659,
+      "loss": 3.412833048502604,
+      "step": 75000
+    },
+    {
+      "epoch": 0.591992706649854,
+      "eval_loss": 3.4130373001098633,
+      "eval_runtime": 749.7485,
+      "eval_samples_per_second": 121.969,
+      "eval_steps_per_second": 15.246,
+      "step": 75000
+    },
+    {
+      "epoch": 0.5943606774764535,
+      "grad_norm": 1.0038580894470215,
+      "learning_rate": 0.00012464609389527126,
+      "loss": 3.4115226236979166,
+      "step": 75300
+    },
+    {
+      "epoch": 0.5967286483030529,
+      "grad_norm": 0.9283474683761597,
+      "learning_rate": 0.00012391847426247664,
+      "loss": 3.409629109700521,
+      "step": 75600
+    },
+    {
+      "epoch": 0.5990966191296523,
+      "grad_norm": 0.9440540671348572,
+      "learning_rate": 0.000123190854629682,
+      "loss": 3.4109674072265626,
+      "step": 75900
+    },
+    {
+      "epoch": 0.6014645899562517,
+      "grad_norm": 0.8556481003761292,
+      "learning_rate": 0.00012246323499688738,
+      "loss": 3.410178019205729,
+      "step": 76200
+    },
+    {
+      "epoch": 0.6038325607828512,
+      "grad_norm": 0.9701557159423828,
+      "learning_rate": 0.00012173561536409277,
+      "loss": 3.4105710856119793,
+      "step": 76500
+    },
+    {
+      "epoch": 0.6062005316094505,
+      "grad_norm": 0.9372541904449463,
+      "learning_rate": 0.00012100799573129814,
+      "loss": 3.408436075846354,
+      "step": 76800
+    },
+    {
+      "epoch": 0.60856850243605,
+      "grad_norm": 0.9367093443870544,
+      "learning_rate": 0.00012028037609850352,
+      "loss": 3.408902791341146,
+      "step": 77100
+    },
+    {
+      "epoch": 0.6109364732626494,
+      "grad_norm": 0.9765172004699707,
+      "learning_rate": 0.0001195527564657089,
+      "loss": 3.410556437174479,
+      "step": 77400
+    },
+    {
+      "epoch": 0.6133044440892488,
+      "grad_norm": 0.9547196626663208,
+      "learning_rate": 0.00011882513683291428,
+      "loss": 3.409205118815104,
+      "step": 77700
+    },
+    {
+      "epoch": 0.6156724149158482,
+      "grad_norm": 0.9072284698486328,
+      "learning_rate": 0.00011809751720011964,
+      "loss": 3.4097003173828124,
+      "step": 78000
+    },
+    {
+      "epoch": 0.6156724149158482,
+      "eval_loss": 3.4099109172821045,
+      "eval_runtime": 817.8309,
+      "eval_samples_per_second": 111.815,
+      "eval_steps_per_second": 13.977,
+      "step": 78000
+    },
+    {
+      "epoch": 0.6180403857424477,
+      "grad_norm": 0.8856763243675232,
+      "learning_rate": 0.00011736989756732501,
+      "loss": 3.4088720703125,
+      "step": 78300
+    },
+    {
+      "epoch": 0.620408356569047,
+      "grad_norm": 0.9035690426826477,
+      "learning_rate": 0.00011664227793453038,
+      "loss": 3.4085628255208333,
+      "step": 78600
+    },
+    {
+      "epoch": 0.6227763273956465,
+      "grad_norm": 0.9037633538246155,
+      "learning_rate": 0.00011591465830173576,
+      "loss": 3.4089361572265626,
+      "step": 78900
+    },
+    {
+      "epoch": 0.6251442982222459,
+      "grad_norm": 0.91520756483078,
+      "learning_rate": 0.00011518703866894114,
+      "loss": 3.408426513671875,
+      "step": 79200
+    },
+    {
+      "epoch": 0.6275122690488453,
+      "grad_norm": 0.9762275815010071,
+      "learning_rate": 0.00011445941903614652,
+      "loss": 3.4068282063802084,
+      "step": 79500
+    },
+    {
+      "epoch": 0.6298802398754447,
+      "grad_norm": 0.9305161833763123,
+      "learning_rate": 0.00011373179940335189,
+      "loss": 3.403935953776042,
+      "step": 79800
+    },
+    {
+      "epoch": 0.6322482107020442,
+      "grad_norm": 0.8783509135246277,
+      "learning_rate": 0.00011300417977055726,
+      "loss": 3.4058880615234375,
+      "step": 80100
+    },
+    {
+      "epoch": 0.6346161815286435,
+      "grad_norm": 0.9528281092643738,
+      "learning_rate": 0.00011227656013776265,
+      "loss": 3.4069441731770835,
+      "step": 80400
+    },
+    {
+      "epoch": 0.636984152355243,
+      "grad_norm": 0.9882702827453613,
+      "learning_rate": 0.00011154894050496802,
+      "loss": 3.407682088216146,
+      "step": 80700
+    },
+    {
+      "epoch": 0.6393521231818424,
+      "grad_norm": 0.9206790328025818,
+      "learning_rate": 0.0001108213208721734,
+      "loss": 3.4039154052734375,
+      "step": 81000
+    },
+    {
+      "epoch": 0.6393521231818424,
+      "eval_loss": 3.40670108795166,
+      "eval_runtime": 820.9942,
+      "eval_samples_per_second": 111.384,
+      "eval_steps_per_second": 13.923,
+      "step": 81000
+    },
+    {
+      "epoch": 0.6417200940084418,
+      "grad_norm": 0.925731360912323,
+      "learning_rate": 0.00011009370123937877,
+      "loss": 3.406180419921875,
+      "step": 81300
+    },
+    {
+      "epoch": 0.6440880648350412,
+      "grad_norm": 0.9606015086174011,
+      "learning_rate": 0.00010936608160658413,
+      "loss": 3.406502482096354,
+      "step": 81600
+    },
+    {
+      "epoch": 0.6464560356616407,
+      "grad_norm": 0.9114608764648438,
+      "learning_rate": 0.0001086384619737895,
+      "loss": 3.4049051920572917,
+      "step": 81900
+    },
+    {
+      "epoch": 0.64882400648824,
+      "grad_norm": 0.905232310295105,
+      "learning_rate": 0.00010791084234099489,
+      "loss": 3.4034344482421877,
+      "step": 82200
+    },
+    {
+      "epoch": 0.6511919773148395,
+      "grad_norm": 0.9017631411552429,
+      "learning_rate": 0.00010718322270820026,
+      "loss": 3.4042232259114584,
+      "step": 82500
+    },
+    {
+      "epoch": 0.6535599481414389,
+      "grad_norm": 0.9682673215866089,
+      "learning_rate": 0.00010645560307540564,
+      "loss": 3.40289794921875,
+      "step": 82800
+    },
+    {
+      "epoch": 0.6559279189680383,
+      "grad_norm": 0.9999443888664246,
+      "learning_rate": 0.00010572798344261101,
+      "loss": 3.4032177734375,
+      "step": 83100
+    },
+    {
+      "epoch": 0.6582958897946377,
+      "grad_norm": 0.9269556403160095,
+      "learning_rate": 0.00010500036380981638,
+      "loss": 3.4007576497395835,
+      "step": 83400
+    },
+    {
+      "epoch": 0.6606638606212372,
+      "grad_norm": 0.9589893817901611,
+      "learning_rate": 0.00010427274417702177,
+      "loss": 3.4032047526041667,
+      "step": 83700
+    },
+    {
+      "epoch": 0.6630318314478366,
+      "grad_norm": 0.8739861249923706,
+      "learning_rate": 0.00010354512454422714,
+      "loss": 3.400506795247396,
+      "step": 84000
+    },
+    {
+      "epoch": 0.6630318314478366,
+      "eval_loss": 3.4034643173217773,
+      "eval_runtime": 825.994,
+      "eval_samples_per_second": 110.71,
+      "eval_steps_per_second": 13.839,
+      "step": 84000
+    },
+    {
+      "epoch": 0.665399802274436,
+      "grad_norm": 0.9189532399177551,
+      "learning_rate": 0.00010281750491143252,
+      "loss": 3.402564697265625,
+      "step": 84300
+    },
+    {
+      "epoch": 0.6677677731010354,
+      "grad_norm": 0.9483964443206787,
+      "learning_rate": 0.00010208988527863789,
+      "loss": 3.4014821370442707,
+      "step": 84600
+    },
+    {
+      "epoch": 0.6701357439276349,
+      "grad_norm": 0.9440945386886597,
+      "learning_rate": 0.00010136226564584325,
+      "loss": 3.400501912434896,
+      "step": 84900
+    },
+    {
+      "epoch": 0.6725037147542342,
+      "grad_norm": 0.9399909973144531,
+      "learning_rate": 0.00010063464601304864,
+      "loss": 3.4025087483723957,
+      "step": 85200
+    },
+    {
+      "epoch": 0.6748716855808337,
+      "grad_norm": 0.9791007041931152,
+      "learning_rate": 9.990702638025401e-05,
+      "loss": 3.399855753580729,
+      "step": 85500
+    },
+    {
+      "epoch": 0.6772396564074331,
+      "grad_norm": 0.8749154210090637,
+      "learning_rate": 9.917940674745938e-05,
+      "loss": 3.4002801513671876,
+      "step": 85800
+    },
+    {
+      "epoch": 0.6796076272340325,
+      "grad_norm": 0.9691766500473022,
+      "learning_rate": 9.845178711466476e-05,
+      "loss": 3.4014939371744792,
+      "step": 86100
+    },
+    {
+      "epoch": 0.6819755980606319,
+      "grad_norm": 0.9257389307022095,
+      "learning_rate": 9.772416748187013e-05,
+      "loss": 3.4006268310546877,
+      "step": 86400
+    },
+    {
+      "epoch": 0.6843435688872314,
+      "grad_norm": 0.961001992225647,
+      "learning_rate": 9.699654784907552e-05,
+      "loss": 3.400779215494792,
+      "step": 86700
+    },
+    {
+      "epoch": 0.6867115397138307,
+      "grad_norm": 0.9671078324317932,
+      "learning_rate": 9.626892821628089e-05,
+      "loss": 3.3995322672526043,
+      "step": 87000
+    },
+    {
+      "epoch": 0.6867115397138307,
+      "eval_loss": 3.4005441665649414,
+      "eval_runtime": 830.8743,
+      "eval_samples_per_second": 110.06,
+      "eval_steps_per_second": 13.758,
+      "step": 87000
+    },
+    {
+      "epoch": 0.6890795105404302,
+      "grad_norm": 0.9077408909797668,
+      "learning_rate": 9.554130858348626e-05,
+      "loss": 3.398675333658854,
+      "step": 87300
+    },
+    {
+      "epoch": 0.6914474813670296,
+      "grad_norm": 0.934999942779541,
+      "learning_rate": 9.481368895069164e-05,
+      "loss": 3.398826090494792,
+      "step": 87600
+    },
+    {
+      "epoch": 0.693815452193629,
+      "grad_norm": 0.9157312512397766,
+      "learning_rate": 9.408606931789702e-05,
+      "loss": 3.3968377685546876,
+      "step": 87900
+    },
+    {
+      "epoch": 0.6961834230202284,
+      "grad_norm": 0.9654635190963745,
+      "learning_rate": 9.335844968510237e-05,
+      "loss": 3.398967081705729,
+      "step": 88200
+    },
+    {
+      "epoch": 0.6985513938468279,
+      "grad_norm": 0.9347310066223145,
+      "learning_rate": 9.263083005230776e-05,
+      "loss": 3.398814493815104,
+      "step": 88500
+    },
+    {
+      "epoch": 0.7009193646734272,
+      "grad_norm": 0.9343422651290894,
+      "learning_rate": 9.190321041951313e-05,
+      "loss": 3.3974904378255206,
+      "step": 88800
+    }
+  ],
+  "logging_steps": 300,
+  "max_steps": 126691,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.2594298740736e+16,
+  "train_batch_size": 72,
+  "trial_name": null,
+  "trial_params": null
+}