diff --git "a/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json" "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json"
@@ -0,0 +1,4432 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.000969696969697,
+  "eval_steps": 516,
+  "global_step": 3093,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 0.8898435831069946,
+      "eval_ppl": 2.43475,
+      "eval_runtime": 12.6383,
+      "eval_samples_per_second": 15.825,
+      "eval_steps_per_second": 7.912,
+      "memory/device_reserved (GiB)": 13.84,
+      "memory/max_active (GiB)": 13.69,
+      "memory/max_allocated (GiB)": 13.69,
+      "step": 0
+    },
+    {
+      "epoch": 0.009696969696969697,
+      "grad_norm": 2.995619058609009,
+      "learning_rate": 3.4951456310679615e-06,
+      "loss": 0.8680612564086914,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.38229,
+      "step": 10,
+      "tokens/total": 163840,
+      "tokens/train_per_sec_per_gpu": 14.27,
+      "tokens/trainable": 51990
+    },
+    {
+      "epoch": 0.019393939393939394,
+      "grad_norm": 2.1244935989379883,
+      "learning_rate": 7.378640776699029e-06,
+      "loss": 0.7699687004089355,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 2.1597,
+      "step": 20,
+      "tokens/total": 327680,
+      "tokens/train_per_sec_per_gpu": 16.06,
+      "tokens/trainable": 104391
+    },
+    {
+      "epoch": 0.02909090909090909,
+      "grad_norm": 0.9706138372421265,
+      "learning_rate": 1.1262135922330098e-05,
+      "loss": 0.5319457054138184,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.70224,
+      "step": 30,
+      "tokens/total": 491520,
+      "tokens/train_per_sec_per_gpu": 16.48,
+      "tokens/trainable": 156787
+    },
+    {
+      "epoch": 0.03878787878787879,
+      "grad_norm": 0.7689842581748962,
+      "learning_rate": 1.5145631067961166e-05,
+      "loss": 0.30234951972961427,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.35303,
+      "step": 40,
+      "tokens/total": 655360,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 208924
+    },
+    {
+      "epoch": 0.048484848484848485,
+      "grad_norm": 0.45850396156311035,
+      "learning_rate": 1.9029126213592234e-05,
+      "loss": 0.1519382953643799,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.16409,
+      "step": 50,
+      "tokens/total": 819200,
+      "tokens/train_per_sec_per_gpu": 14.61,
+      "tokens/trainable": 261170
+    },
+    {
+      "epoch": 0.05818181818181818,
+      "grad_norm": 0.41381561756134033,
+      "learning_rate": 2.29126213592233e-05,
+      "loss": 0.062263429164886475,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.06424,
+      "step": 60,
+      "tokens/total": 983040,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 313808
+    },
+    {
+      "epoch": 0.06787878787878789,
+      "grad_norm": 0.4865979254245758,
+      "learning_rate": 2.6796116504854367e-05,
+      "loss": 0.018695920705795288,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01887,
+      "step": 70,
+      "tokens/total": 1146880,
+      "tokens/train_per_sec_per_gpu": 14.62,
+      "tokens/trainable": 366068
+    },
+    {
+      "epoch": 0.07757575757575758,
+      "grad_norm": 0.39099738001823425,
+      "learning_rate": 3.067961165048544e-05,
+      "loss": 0.006136053055524826,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00615,
+      "step": 80,
+      "tokens/total": 1310720,
+      "tokens/train_per_sec_per_gpu": 13.81,
+      "tokens/trainable": 418120
+    },
+    {
+      "epoch": 0.08727272727272728,
+      "grad_norm": 0.08230593055486679,
+      "learning_rate": 3.456310679611651e-05,
+      "loss": 0.004204501211643219,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00421,
+      "step": 90,
+      "tokens/total": 1474560,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 470244
+    },
+    {
+      "epoch": 0.09696969696969697,
+      "grad_norm": 0.13297680020332336,
+      "learning_rate": 3.844660194174757e-05,
+      "loss": 0.0036250378936529158,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00363,
+      "step": 100,
+      "tokens/total": 1638400,
+      "tokens/train_per_sec_per_gpu": 14.91,
+      "tokens/trainable": 522666
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.2430051565170288,
+      "learning_rate": 4.2330097087378647e-05,
+      "loss": 0.003873714804649353,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00388,
+      "step": 110,
+      "tokens/total": 1802240,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 574329
+    },
+    {
+      "epoch": 0.11636363636363636,
+      "grad_norm": 0.09347938001155853,
+      "learning_rate": 4.621359223300971e-05,
+      "loss": 0.00237951148301363,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00238,
+      "step": 120,
+      "tokens/total": 1966080,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 626194
+    },
+    {
+      "epoch": 0.12606060606060607,
+      "grad_norm": 0.13388365507125854,
+      "learning_rate": 5.0097087378640786e-05,
+      "loss": 0.0015400107949972153,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00154,
+      "step": 130,
+      "tokens/total": 2129920,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 678140
+    },
+    {
+      "epoch": 0.13575757575757577,
+      "grad_norm": 0.13342970609664917,
+      "learning_rate": 5.398058252427185e-05,
+      "loss": 0.001996887102723122,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.002,
+      "step": 140,
+      "tokens/total": 2293760,
+      "tokens/train_per_sec_per_gpu": 14.41,
+      "tokens/trainable": 730201
+    },
+    {
+      "epoch": 0.14545454545454545,
+      "grad_norm": 0.0299234539270401,
+      "learning_rate": 5.786407766990292e-05,
+      "loss": 0.0015132850036025046,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00151,
+      "step": 150,
+      "tokens/total": 2457600,
+      "tokens/train_per_sec_per_gpu": 15.8,
+      "tokens/trainable": 782196
+    },
+    {
+      "epoch": 0.15515151515151515,
+      "grad_norm": 0.04437975212931633,
+      "learning_rate": 6.174757281553398e-05,
+      "loss": 0.0012883609160780907,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00129,
+      "step": 160,
+      "tokens/total": 2621440,
+      "tokens/train_per_sec_per_gpu": 14.64,
+      "tokens/trainable": 833614
+    },
+    {
+      "epoch": 0.16484848484848486,
+      "grad_norm": 0.014039761386811733,
+      "learning_rate": 6.563106796116505e-05,
+      "loss": 0.0011639594100415706,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 170,
+      "tokens/total": 2785280,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 885591
+    },
+    {
+      "epoch": 0.17454545454545456,
+      "grad_norm": 0.0033261056523770094,
+      "learning_rate": 6.951456310679612e-05,
+      "loss": 0.0007388167083263397,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00074,
+      "step": 180,
+      "tokens/total": 2949120,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 937712
+    },
+    {
+      "epoch": 0.18424242424242424,
+      "grad_norm": 0.010476192459464073,
+      "learning_rate": 7.339805825242719e-05,
+      "loss": 0.0008642122149467469,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 190,
+      "tokens/total": 3112960,
+      "tokens/train_per_sec_per_gpu": 15.52,
+      "tokens/trainable": 989913
+    },
+    {
+      "epoch": 0.19393939393939394,
+      "grad_norm": 0.01253255270421505,
+      "learning_rate": 7.728155339805826e-05,
+      "loss": 0.0007610846310853958,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 200,
+      "tokens/total": 3276800,
+      "tokens/train_per_sec_per_gpu": 14.17,
+      "tokens/trainable": 1041978
+    },
+    {
+      "epoch": 0.20363636363636364,
+      "grad_norm": 0.01779557578265667,
+      "learning_rate": 8.116504854368933e-05,
+      "loss": 0.0007697530556470156,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00077,
+      "step": 210,
+      "tokens/total": 3440640,
+      "tokens/train_per_sec_per_gpu": 14.12,
+      "tokens/trainable": 1093395
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.16895800828933716,
+      "learning_rate": 8.504854368932039e-05,
+      "loss": 0.0006535804830491542,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 220,
+      "tokens/total": 3604480,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1145329
+    },
+    {
+      "epoch": 0.22303030303030302,
+      "grad_norm": 0.08973463624715805,
+      "learning_rate": 8.893203883495146e-05,
+      "loss": 0.0009510296396911145,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 230,
+      "tokens/total": 3768320,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 1197537
+    },
+    {
+      "epoch": 0.23272727272727273,
+      "grad_norm": 0.044939588755369186,
+      "learning_rate": 9.281553398058253e-05,
+      "loss": 0.001187363639473915,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00119,
+      "step": 240,
+      "tokens/total": 3932160,
+      "tokens/train_per_sec_per_gpu": 15.39,
+      "tokens/trainable": 1249924
+    },
+    {
+      "epoch": 0.24242424242424243,
+      "grad_norm": 0.08850465714931488,
+      "learning_rate": 9.66990291262136e-05,
+      "loss": 0.0013382930308580398,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00134,
+      "step": 250,
+      "tokens/total": 4096000,
+      "tokens/train_per_sec_per_gpu": 15.06,
+      "tokens/trainable": 1301558
+    },
+    {
+      "epoch": 0.25212121212121213,
+      "grad_norm": 0.101528100669384,
+      "learning_rate": 0.00010058252427184467,
+      "loss": 0.0008709387853741646,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 260,
+      "tokens/total": 4259840,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 1353706
+    },
+    {
+      "epoch": 0.26181818181818184,
+      "grad_norm": 0.08298433572053909,
+      "learning_rate": 0.00010446601941747574,
+      "loss": 0.0013300922699272632,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 270,
+      "tokens/total": 4423680,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 1405519
+    },
+    {
+      "epoch": 0.27151515151515154,
+      "grad_norm": 0.03734389320015907,
+      "learning_rate": 0.00010834951456310681,
+      "loss": 0.0006868645548820495,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00069,
+      "step": 280,
+      "tokens/total": 4587520,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 1457494
+    },
+    {
+      "epoch": 0.2812121212121212,
+      "grad_norm": 0.07898428291082382,
+      "learning_rate": 0.00011223300970873786,
+      "loss": 0.0013550779782235622,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00136,
+      "step": 290,
+      "tokens/total": 4751360,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 1509320
+    },
+    {
+      "epoch": 0.2909090909090909,
+      "grad_norm": 0.06320006400346756,
+      "learning_rate": 0.00011611650485436893,
+      "loss": 0.0010121697559952736,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00101,
+      "step": 300,
+      "tokens/total": 4915200,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 1561332
+    },
+    {
+      "epoch": 0.3006060606060606,
+      "grad_norm": 0.013749867677688599,
+      "learning_rate": 0.00012,
+      "loss": 0.0006499682553112507,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 310,
+      "tokens/total": 5079040,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 1613189
+    },
+    {
+      "epoch": 0.3103030303030303,
+      "grad_norm": 0.033964402973651886,
+      "learning_rate": 0.00012388349514563107,
+      "loss": 0.0008866124786436558,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00089,
+      "step": 320,
+      "tokens/total": 5242880,
+      "tokens/train_per_sec_per_gpu": 15.78,
+      "tokens/trainable": 1665681
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.04327597841620445,
+      "learning_rate": 0.00012776699029126213,
+      "loss": 0.0005569641944020987,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 330,
+      "tokens/total": 5406720,
+      "tokens/train_per_sec_per_gpu": 14.92,
+      "tokens/trainable": 1718317
+    },
+    {
+      "epoch": 0.3296969696969697,
+      "grad_norm": 0.02717934548854828,
+      "learning_rate": 0.0001316504854368932,
+      "loss": 0.0003776244120672345,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00038,
+      "step": 340,
+      "tokens/total": 5570560,
+      "tokens/train_per_sec_per_gpu": 14.42,
+      "tokens/trainable": 1770210
+    },
+    {
+      "epoch": 0.3393939393939394,
+      "grad_norm": 0.0028237912338227034,
+      "learning_rate": 0.0001355339805825243,
+      "loss": 0.0005292522720992566,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00053,
+      "step": 350,
+      "tokens/total": 5734400,
+      "tokens/train_per_sec_per_gpu": 16.4,
+      "tokens/trainable": 1821987
+    },
+    {
+      "epoch": 0.3490909090909091,
+      "grad_norm": 0.0310799703001976,
+      "learning_rate": 0.00013941747572815535,
+      "loss": 0.0006786303594708443,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 360,
+      "tokens/total": 5898240,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 1874266
+    },
+    {
+      "epoch": 0.35878787878787877,
+      "grad_norm": 0.17325043678283691,
+      "learning_rate": 0.0001433009708737864,
+      "loss": 0.0013975565321743487,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0014,
+      "step": 370,
+      "tokens/total": 6062080,
+      "tokens/train_per_sec_per_gpu": 13.73,
+      "tokens/trainable": 1926124
+    },
+    {
+      "epoch": 0.36848484848484847,
+      "grad_norm": 0.07738752663135529,
+      "learning_rate": 0.0001471844660194175,
+      "loss": 0.0006820175796747208,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 380,
+      "tokens/total": 6225920,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 1978693
+    },
+    {
+      "epoch": 0.3781818181818182,
+      "grad_norm": 0.10022349655628204,
+      "learning_rate": 0.00015106796116504855,
+      "loss": 0.00063879219815135,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 390,
+      "tokens/total": 6389760,
+      "tokens/train_per_sec_per_gpu": 13.34,
+      "tokens/trainable": 2030378
+    },
+    {
+      "epoch": 0.3878787878787879,
+      "grad_norm": 0.0495997779071331,
+      "learning_rate": 0.00015495145631067963,
+      "loss": 0.0021283581852912905,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00213,
+      "step": 400,
+      "tokens/total": 6553600,
+      "tokens/train_per_sec_per_gpu": 15.34,
+      "tokens/trainable": 2083047
+    },
+    {
+      "epoch": 0.3975757575757576,
+      "grad_norm": 0.07361701130867004,
+      "learning_rate": 0.0001588349514563107,
+      "loss": 0.001862115040421486,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00186,
+      "step": 410,
+      "tokens/total": 6717440,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 2135527
+    },
+    {
+      "epoch": 0.4072727272727273,
+      "grad_norm": 0.05466209724545479,
+      "learning_rate": 0.00016271844660194174,
+      "loss": 0.0011581303551793098,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00116,
+      "step": 420,
+      "tokens/total": 6881280,
+      "tokens/train_per_sec_per_gpu": 14.77,
+      "tokens/trainable": 2187636
+    },
+    {
+      "epoch": 0.416969696969697,
+      "grad_norm": 0.04331392049789429,
+      "learning_rate": 0.00016660194174757283,
+      "loss": 0.0051729224622249605,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00519,
+      "step": 430,
+      "tokens/total": 7045120,
+      "tokens/train_per_sec_per_gpu": 13.76,
+      "tokens/trainable": 2239006
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.05931795388460159,
+      "learning_rate": 0.00017048543689320388,
+      "loss": 0.00242764875292778,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00243,
+      "step": 440,
+      "tokens/total": 7208960,
+      "tokens/train_per_sec_per_gpu": 14.59,
+      "tokens/trainable": 2290540
+    },
+    {
+      "epoch": 0.43636363636363634,
+      "grad_norm": 0.04634418711066246,
+      "learning_rate": 0.00017436893203883494,
+      "loss": 0.001389546226710081,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00139,
+      "step": 450,
+      "tokens/total": 7372800,
+      "tokens/train_per_sec_per_gpu": 14.78,
+      "tokens/trainable": 2341852
+    },
+    {
+      "epoch": 0.44606060606060605,
+      "grad_norm": 0.04817213863134384,
+      "learning_rate": 0.00017825242718446602,
+      "loss": 0.001370794139802456,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00137,
+      "step": 460,
+      "tokens/total": 7536640,
+      "tokens/train_per_sec_per_gpu": 13.77,
+      "tokens/trainable": 2393320
+    },
+    {
+      "epoch": 0.45575757575757575,
+      "grad_norm": 0.011335949413478374,
+      "learning_rate": 0.00018213592233009708,
+      "loss": 0.0009715131483972073,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 470,
+      "tokens/total": 7700480,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 2445170
+    },
+    {
+      "epoch": 0.46545454545454545,
+      "grad_norm": 0.05298445746302605,
+      "learning_rate": 0.00018601941747572816,
+      "loss": 0.0008222623728215694,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 480,
+      "tokens/total": 7864320,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 2497473
+    },
+    {
+      "epoch": 0.47515151515151516,
+      "grad_norm": 0.061686884611845016,
+      "learning_rate": 0.00018990291262135925,
+      "loss": 0.000748783303424716,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00075,
+      "step": 490,
+      "tokens/total": 8028160,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 2549206
+    },
+    {
+      "epoch": 0.48484848484848486,
+      "grad_norm": 0.03281249850988388,
+      "learning_rate": 0.0001937864077669903,
+      "loss": 0.0006062469445168972,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00061,
+      "step": 500,
+      "tokens/total": 8192000,
+      "tokens/train_per_sec_per_gpu": 14.49,
+      "tokens/trainable": 2600583
+    },
+    {
+      "epoch": 0.49454545454545457,
+      "grad_norm": 0.008482079952955246,
+      "learning_rate": 0.0001976699029126214,
+      "loss": 0.0008583014830946922,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00086,
+      "step": 510,
+      "tokens/total": 8355840,
+      "tokens/train_per_sec_per_gpu": 13.86,
+      "tokens/trainable": 2652927
+    },
+    {
+      "epoch": 0.5003636363636363,
+      "eval_loss": 0.0009036393603309989,
+      "eval_ppl": 1.0009,
+      "eval_runtime": 12.7872,
+      "eval_samples_per_second": 15.641,
+      "eval_steps_per_second": 7.82,
+      "memory/device_reserved (GiB)": 18.85,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "step": 516
+    },
+    {
+      "epoch": 0.5042424242424243,
+      "grad_norm": 0.04333305358886719,
+      "learning_rate": 0.0001999996332640321,
+      "loss": 0.0005093200132250785,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 520,
+      "tokens/total": 8519680,
+      "tokens/train_per_sec_per_gpu": 14.09,
+      "tokens/trainable": 2705083
+    },
+    {
+      "epoch": 0.5139393939393939,
+      "grad_norm": 0.02485118806362152,
+      "learning_rate": 0.00019999550751528488,
+      "loss": 0.0006649125367403031,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 530,
+      "tokens/total": 8683520,
+      "tokens/train_per_sec_per_gpu": 14.44,
+      "tokens/trainable": 2756975
+    },
+    {
+      "epoch": 0.5236363636363637,
+      "grad_norm": 0.03736363351345062,
+      "learning_rate": 0.00019998679778759294,
+      "loss": 0.0006726076360791921,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 540,
+      "tokens/total": 8847360,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 2808076
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.05156765505671501,
+      "learning_rate": 0.0001999735044802263,
+      "loss": 0.000789718609303236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00079,
+      "step": 550,
+      "tokens/total": 9011200,
+      "tokens/train_per_sec_per_gpu": 16.36,
+      "tokens/trainable": 2859893
+    },
+    {
+      "epoch": 0.5430303030303031,
+      "grad_norm": 0.647550106048584,
+      "learning_rate": 0.00019995562820257474,
+      "loss": 0.003008325584232807,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00301,
+      "step": 560,
+      "tokens/total": 9175040,
+      "tokens/train_per_sec_per_gpu": 14.21,
+      "tokens/trainable": 2911399
+    },
+    {
+      "epoch": 0.5527272727272727,
+      "grad_norm": 0.185165673494339,
+      "learning_rate": 0.00019993316977411993,
+      "loss": 0.013715097308158874,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.01381,
+      "step": 570,
+      "tokens/total": 9338880,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 2962403
+    },
+    {
+      "epoch": 0.5624242424242424,
+      "grad_norm": 0.2401553839445114,
+      "learning_rate": 0.0001999061302243977,
+      "loss": 0.009026474505662917,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00907,
+      "step": 580,
+      "tokens/total": 9502720,
+      "tokens/train_per_sec_per_gpu": 14.38,
+      "tokens/trainable": 3015083
+    },
+    {
+      "epoch": 0.5721212121212121,
+      "grad_norm": 0.08092579245567322,
+      "learning_rate": 0.000199874510792951,
+      "loss": 0.005716494470834732,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00573,
+      "step": 590,
+      "tokens/total": 9666560,
+      "tokens/train_per_sec_per_gpu": 16.38,
+      "tokens/trainable": 3066501
+    },
+    {
+      "epoch": 0.5818181818181818,
+      "grad_norm": 3.418715476989746,
+      "learning_rate": 0.00019983831292927305,
+      "loss": 0.048504295945167544,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0497,
+      "step": 600,
+      "tokens/total": 9830400,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 3118633
+    },
+    {
+      "epoch": 0.5915151515151515,
+      "grad_norm": 0.2194036841392517,
+      "learning_rate": 0.00019979753829274085,
+      "loss": 0.03429323434829712,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.03489,
+      "step": 610,
+      "tokens/total": 9994240,
+      "tokens/train_per_sec_per_gpu": 13.14,
+      "tokens/trainable": 3170577
+    },
+    {
+      "epoch": 0.6012121212121212,
+      "grad_norm": 0.022929901257157326,
+      "learning_rate": 0.0001997521887525391,
+      "loss": 0.0015171168372035027,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00152,
+      "step": 620,
+      "tokens/total": 10158080,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 3221696
+    },
+    {
+      "epoch": 0.610909090909091,
+      "grad_norm": 0.10083670169115067,
+      "learning_rate": 0.00019970226638757458,
+      "loss": 0.0025377947837114333,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00254,
+      "step": 630,
+      "tokens/total": 10321920,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 3273775
+    },
+    {
+      "epoch": 0.6206060606060606,
+      "grad_norm": 0.01761380024254322,
+      "learning_rate": 0.00019964777348638083,
+      "loss": 0.002281896211206913,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00228,
+      "step": 640,
+      "tokens/total": 10485760,
+      "tokens/train_per_sec_per_gpu": 14.89,
+      "tokens/trainable": 3325516
+    },
+    {
+      "epoch": 0.6303030303030303,
+      "grad_norm": 0.004510029684752226,
+      "learning_rate": 0.00019958871254701315,
+      "loss": 0.0009477110579609871,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 650,
+      "tokens/total": 10649600,
+      "tokens/train_per_sec_per_gpu": 16.46,
+      "tokens/trainable": 3377214
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.05332477018237114,
+      "learning_rate": 0.0001995250862769342,
+      "loss": 0.0005660496186465025,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00057,
+      "step": 660,
+      "tokens/total": 10813440,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 3428627
+    },
+    {
+      "epoch": 0.6496969696969697,
+      "grad_norm": 0.03861689195036888,
+      "learning_rate": 0.0001994568975928899,
+      "loss": 0.0008976863697171211,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0009,
+      "step": 670,
+      "tokens/total": 10977280,
+      "tokens/train_per_sec_per_gpu": 15.66,
+      "tokens/trainable": 3480170
+    },
+    {
+      "epoch": 0.6593939393939394,
+      "grad_norm": 0.021123304963111877,
+      "learning_rate": 0.00019938414962077553,
+      "loss": 0.0009612766094505787,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00096,
+      "step": 680,
+      "tokens/total": 11141120,
+      "tokens/train_per_sec_per_gpu": 15.15,
+      "tokens/trainable": 3532037
+    },
+    {
+      "epoch": 0.6690909090909091,
+      "grad_norm": 0.02421347238123417,
+      "learning_rate": 0.00019930684569549264,
+      "loss": 0.001021684519946575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00102,
+      "step": 690,
+      "tokens/total": 11304960,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 3583461
+    },
+    {
+      "epoch": 0.6787878787878788,
+      "grad_norm": 0.05008835345506668,
+      "learning_rate": 0.00019922498936079613,
+      "loss": 0.0007617876864969731,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00076,
+      "step": 700,
+      "tokens/total": 11468800,
+      "tokens/train_per_sec_per_gpu": 14.08,
+      "tokens/trainable": 3634649
+    },
+    {
+      "epoch": 0.6884848484848485,
+      "grad_norm": 0.035733792930841446,
+      "learning_rate": 0.00019913858436913171,
+      "loss": 0.0012347914278507232,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00124,
+      "step": 710,
+      "tokens/total": 11632640,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 3685786
+    },
+    {
+      "epoch": 0.6981818181818182,
+      "grad_norm": 0.010948767885565758,
+      "learning_rate": 0.00019904763468146393,
+      "loss": 0.0008165687322616577,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 720,
+      "tokens/total": 11796480,
+      "tokens/train_per_sec_per_gpu": 15.77,
+      "tokens/trainable": 3737566
+    },
+    {
+      "epoch": 0.7078787878787879,
+      "grad_norm": 0.03577027469873428,
+      "learning_rate": 0.00019895214446709463,
+      "loss": 0.001333119161427021,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00133,
+      "step": 730,
+      "tokens/total": 11960320,
+      "tokens/train_per_sec_per_gpu": 13.98,
+      "tokens/trainable": 3789817
+    },
+    {
+      "epoch": 0.7175757575757575,
+      "grad_norm": 0.03971279785037041,
+      "learning_rate": 0.00019885211810347184,
+      "loss": 0.0011184611357748508,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00112,
+      "step": 740,
+      "tokens/total": 12124160,
+      "tokens/train_per_sec_per_gpu": 14.67,
+      "tokens/trainable": 3841912
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 0.06546575576066971,
+      "learning_rate": 0.00019874756017598894,
+      "loss": 0.0012452728115022182,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00125,
+      "step": 750,
+      "tokens/total": 12288000,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 3893725
+    },
+    {
+      "epoch": 0.7369696969696969,
+      "grad_norm": 0.047058816999197006,
+      "learning_rate": 0.00019863847547777467,
+      "loss": 0.0008146104402840138,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00081,
+      "step": 760,
+      "tokens/total": 12451840,
+      "tokens/train_per_sec_per_gpu": 13.49,
+      "tokens/trainable": 3945033
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.028811641037464142,
+      "learning_rate": 0.00019852486900947327,
+      "loss": 0.0008652995340526104,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00087,
+      "step": 770,
+      "tokens/total": 12615680,
+      "tokens/train_per_sec_per_gpu": 15.12,
+      "tokens/trainable": 3996749
+    },
+    {
+      "epoch": 0.7563636363636363,
+      "grad_norm": 0.012203546240925789,
+      "learning_rate": 0.0001984067459790153,
+      "loss": 0.000670672720298171,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 780,
+      "tokens/total": 12779520,
+      "tokens/train_per_sec_per_gpu": 13.71,
+      "tokens/trainable": 4048173
+    },
+    {
+      "epoch": 0.7660606060606061,
+      "grad_norm": 0.016218814998865128,
+      "learning_rate": 0.0001982841118013789,
+      "loss": 0.00046353964135050776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 790,
+      "tokens/total": 12943360,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 4099789
+    },
+    {
+      "epoch": 0.7757575757575758,
+      "grad_norm": 0.034673016518354416,
+      "learning_rate": 0.00019815697209834147,
+      "loss": 0.000707306619733572,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00071,
+      "step": 800,
+      "tokens/total": 13107200,
+      "tokens/train_per_sec_per_gpu": 14.45,
+      "tokens/trainable": 4150960
+    },
+    {
+      "epoch": 0.7854545454545454,
+      "grad_norm": 0.0022127812262624502,
+      "learning_rate": 0.00019802533269822208,
+      "loss": 0.00021896373946219682,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 810,
+      "tokens/total": 13271040,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4202984
+    },
+    {
+      "epoch": 0.7951515151515152,
+      "grad_norm": 0.000919274752959609,
+      "learning_rate": 0.00019788919963561422,
+      "loss": 0.00043264860287308695,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 820,
+      "tokens/total": 13434880,
+      "tokens/train_per_sec_per_gpu": 14.06,
+      "tokens/trainable": 4254907
+    },
+    {
+      "epoch": 0.8048484848484848,
+      "grad_norm": 0.007699873298406601,
+      "learning_rate": 0.00019774857915110913,
+      "loss": 0.0003196246922016144,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 830,
+      "tokens/total": 13598720,
+      "tokens/train_per_sec_per_gpu": 14.75,
+      "tokens/trainable": 4306095
+    },
+    {
+      "epoch": 0.8145454545454546,
+      "grad_norm": 0.015523642301559448,
+      "learning_rate": 0.00019760347769100987,
+      "loss": 0.0004476988688111305,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00045,
+      "step": 840,
+      "tokens/total": 13762560,
+      "tokens/train_per_sec_per_gpu": 14.14,
+      "tokens/trainable": 4357442
+    },
+    {
+      "epoch": 0.8242424242424242,
+      "grad_norm": 0.013460986316204071,
+      "learning_rate": 0.00019745390190703565,
+      "loss": 0.0004673306830227375,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 850,
+      "tokens/total": 13926400,
+      "tokens/train_per_sec_per_gpu": 14.1,
+      "tokens/trainable": 4409277
+    },
+    {
+      "epoch": 0.833939393939394,
+      "grad_norm": 0.0014691110700368881,
+      "learning_rate": 0.0001972998586560169,
+      "loss": 0.0003277578856796026,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 860,
+      "tokens/total": 14090240,
+      "tokens/train_per_sec_per_gpu": 14.28,
+      "tokens/trainable": 4460714
+    },
+    {
+      "epoch": 0.8436363636363636,
+      "grad_norm": 0.001358041656203568,
+      "learning_rate": 0.00019714135499958112,
+      "loss": 0.00032470382284373046,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 870,
+      "tokens/total": 14254080,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4511989
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.04510723799467087,
+      "learning_rate": 0.0001969783982038289,
+      "loss": 0.00023182881996035575,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 880,
+      "tokens/total": 14417920,
+      "tokens/train_per_sec_per_gpu": 15.41,
+      "tokens/trainable": 4563354
+    },
+    {
+      "epoch": 0.863030303030303,
+      "grad_norm": 0.14508692920207977,
+      "learning_rate": 0.00019681099573900113,
+      "loss": 0.00026136748492717744,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 890,
+      "tokens/total": 14581760,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 4615691
+    },
+    {
+      "epoch": 0.8727272727272727,
+      "grad_norm": 0.010969490744173527,
+      "learning_rate": 0.00019663915527913625,
+      "loss": 0.00016044279327616097,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 900,
+      "tokens/total": 14745600,
+      "tokens/train_per_sec_per_gpu": 15.76,
+      "tokens/trainable": 4667433
+    },
+    {
+      "epoch": 0.8824242424242424,
+      "grad_norm": 0.03874114155769348,
+      "learning_rate": 0.00019646288470171868,
+      "loss": 0.0004159804433584213,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 910,
+      "tokens/total": 14909440,
+      "tokens/train_per_sec_per_gpu": 16.01,
+      "tokens/trainable": 4719807
+    },
+    {
+      "epoch": 0.8921212121212121,
+      "grad_norm": 0.044620465487241745,
+      "learning_rate": 0.00019628219208731756,
+      "loss": 0.0006739750038832426,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 920,
+      "tokens/total": 15073280,
+      "tokens/train_per_sec_per_gpu": 15.05,
+      "tokens/trainable": 4771772
+    },
+    {
+      "epoch": 0.9018181818181819,
+      "grad_norm": 0.024856949225068092,
+      "learning_rate": 0.00019609708571921645,
+      "loss": 0.00039347023703157903,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 930,
+      "tokens/total": 15237120,
+      "tokens/train_per_sec_per_gpu": 15.16,
+      "tokens/trainable": 4823415
+    },
+    {
+      "epoch": 0.9115151515151515,
+      "grad_norm": 0.022198157384991646,
+      "learning_rate": 0.0001959075740830335,
+      "loss": 0.0005907822400331497,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00059,
+      "step": 940,
+      "tokens/total": 15400960,
+      "tokens/train_per_sec_per_gpu": 15.36,
+      "tokens/trainable": 4875269
+    },
+    {
+      "epoch": 0.9212121212121213,
+      "grad_norm": 0.01670038513839245,
+      "learning_rate": 0.00019571366586633245,
+      "loss": 0.00027316866908222437,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 950,
+      "tokens/total": 15564800,
+      "tokens/train_per_sec_per_gpu": 15.11,
+      "tokens/trainable": 4927244
+    },
+    {
+      "epoch": 0.9309090909090909,
+      "grad_norm": 0.021392742171883583,
+      "learning_rate": 0.00019551536995822454,
+      "loss": 0.0004320886451750994,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00043,
+      "step": 960,
+      "tokens/total": 15728640,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 4979068
+    },
+    {
+      "epoch": 0.9406060606060606,
+      "grad_norm": 0.028143158182501793,
+      "learning_rate": 0.00019531269544896076,
+      "loss": 0.0005637989845126868,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00056,
+      "step": 970,
+      "tokens/total": 15892480,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 5030980
+    },
+    {
+      "epoch": 0.9503030303030303,
+      "grad_norm": 0.077091746032238,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.0010597245767712594,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00106,
+      "step": 980,
+      "tokens/total": 16056320,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 5082759
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.04455556347966194,
+      "learning_rate": 0.00019489424799115984,
+      "loss": 0.0009517236612737179,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00095,
+      "step": 990,
+      "tokens/total": 16220160,
+      "tokens/train_per_sec_per_gpu": 13.04,
+      "tokens/trainable": 5134379
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 0.03573840856552124,
+      "learning_rate": 0.00019467849422502784,
+      "loss": 0.0008812972344458103,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00088,
+      "step": 1000,
+      "tokens/total": 16384000,
+      "tokens/train_per_sec_per_gpu": 15.23,
+      "tokens/trainable": 5186184
+    },
+    {
+      "epoch": 0.9793939393939394,
+      "grad_norm": 0.0006549305398948491,
+      "learning_rate": 0.0001944584002216709,
+      "loss": 0.0006358013488352299,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1010,
+      "tokens/total": 16547840,
+      "tokens/train_per_sec_per_gpu": 16.1,
+      "tokens/trainable": 5238320
+    },
+    {
+      "epoch": 0.9890909090909091,
+      "grad_norm": 0.021742813289165497,
+      "learning_rate": 0.00019423397607060507,
+      "loss": 0.000400003744289279,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1020,
+      "tokens/total": 16711680,
+      "tokens/train_per_sec_per_gpu": 14.53,
+      "tokens/trainable": 5290445
+    },
+    {
+      "epoch": 0.9987878787878788,
+      "grad_norm": 0.04323820024728775,
+      "learning_rate": 0.00019400523205984833,
+      "loss": 0.0002954686991870403,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1030,
+      "tokens/total": 16875520,
+      "tokens/train_per_sec_per_gpu": 14.98,
+      "tokens/trainable": 5342720
+    },
+    {
+      "epoch": 1.001939393939394,
+      "eval_loss": 0.00047458006883971393,
+      "eval_ppl": 1.00047,
+      "eval_runtime": 11.7938,
+      "eval_samples_per_second": 16.958,
+      "eval_steps_per_second": 8.479,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.73,
+      "memory/max_allocated (GiB)": 16.73,
+      "step": 1032
+    },
+    {
+      "epoch": 1.0096969696969698,
+      "grad_norm": 0.000988126266747713,
+      "learning_rate": 0.00019377217867544907,
+      "loss": 0.0004762394353747368,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00048,
+      "step": 1040,
+      "tokens/total": 17051648,
+      "tokens/train_per_sec_per_gpu": 14.47,
+      "tokens/trainable": 5398184
+    },
+    {
+      "epoch": 1.0193939393939393,
+      "grad_norm": 0.0011711094994097948,
+      "learning_rate": 0.00019353482660100537,
+      "loss": 0.00022675264626741408,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1050,
+      "tokens/total": 17215488,
+      "tokens/train_per_sec_per_gpu": 14.05,
+      "tokens/trainable": 5450329
+    },
+    {
+      "epoch": 1.029090909090909,
+      "grad_norm": 0.007319436874240637,
+      "learning_rate": 0.0001932931867171751,
+      "loss": 0.0003059083363041282,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1060,
+      "tokens/total": 17379328,
+      "tokens/train_per_sec_per_gpu": 13.66,
+      "tokens/trainable": 5502706
+    },
+    {
+      "epoch": 1.0387878787878788,
+      "grad_norm": 0.00967186689376831,
+      "learning_rate": 0.0001930472701011773,
+      "loss": 0.0003639918984845281,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1070,
+      "tokens/total": 17543168,
+      "tokens/train_per_sec_per_gpu": 15.36,
+      "tokens/trainable": 5554957
+    },
+    {
+      "epoch": 1.0484848484848486,
+      "grad_norm": 0.0018478024285286665,
+      "learning_rate": 0.00019279708802628437,
+      "loss": 0.0002576910424977541,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1080,
+      "tokens/total": 17707008,
+      "tokens/train_per_sec_per_gpu": 14.73,
+      "tokens/trainable": 5607534
+    },
+    {
+      "epoch": 1.0581818181818181,
+      "grad_norm": 0.018235478550195694,
+      "learning_rate": 0.00019254265196130517,
+      "loss": 0.0003647733014076948,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1090,
+      "tokens/total": 17870848,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 5659689
+    },
+    {
+      "epoch": 1.0678787878787879,
+      "grad_norm": 0.024314021691679955,
+      "learning_rate": 0.0001922839735700593,
+      "loss": 0.00030459570698440077,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1100,
+      "tokens/total": 18034688,
+      "tokens/train_per_sec_per_gpu": 13.67,
+      "tokens/trainable": 5711346
+    },
+    {
+      "epoch": 1.0775757575757576,
+      "grad_norm": 0.0177497286349535,
+      "learning_rate": 0.0001920210647108425,
+      "loss": 0.00023341022897511722,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1110,
+      "tokens/total": 18198528,
+      "tokens/train_per_sec_per_gpu": 14.13,
+      "tokens/trainable": 5763094
+    },
+    {
+      "epoch": 1.0872727272727274,
+      "grad_norm": 0.005781313870102167,
+      "learning_rate": 0.00019175393743588295,
+      "loss": 0.0002974884817376733,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1120,
+      "tokens/total": 18362368,
+      "tokens/train_per_sec_per_gpu": 14.55,
+      "tokens/trainable": 5815101
+    },
+    {
+      "epoch": 1.096969696969697,
+      "grad_norm": 0.0026403339579701424,
+      "learning_rate": 0.00019148260399078887,
+      "loss": 0.00010604445124045015,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 1130,
+      "tokens/total": 18526208,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 5866763
+    },
+    {
+      "epoch": 1.1066666666666667,
+      "grad_norm": 0.03586777299642563,
+      "learning_rate": 0.000191207076813987,
+      "loss": 0.00027820770628750324,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00028,
+      "step": 1140,
+      "tokens/total": 18690048,
+      "tokens/train_per_sec_per_gpu": 13.83,
+      "tokens/trainable": 5918322
+    },
+    {
+      "epoch": 1.1163636363636364,
+      "grad_norm": 0.007715190295130014,
+      "learning_rate": 0.00019092736853615257,
+      "loss": 0.00029321699403226373,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 1150,
+      "tokens/total": 18853888,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 5970153
+    },
+    {
+      "epoch": 1.126060606060606,
+      "grad_norm": 0.05122547224164009,
+      "learning_rate": 0.00019064349197963013,
+      "loss": 0.0005070990417152643,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1160,
+      "tokens/total": 19017728,
+      "tokens/train_per_sec_per_gpu": 15.51,
+      "tokens/trainable": 6021741
+    },
+    {
+      "epoch": 1.1357575757575757,
+      "grad_norm": 0.032420564442873,
+      "learning_rate": 0.000190355460157846,
+      "loss": 0.00031497194431722163,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1170,
+      "tokens/total": 19181568,
+      "tokens/train_per_sec_per_gpu": 16.05,
+      "tokens/trainable": 6074092
+    },
+    {
+      "epoch": 1.1454545454545455,
+      "grad_norm": 0.03688061609864235,
+      "learning_rate": 0.00019006328627471132,
+      "loss": 0.0003225028282031417,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1180,
+      "tokens/total": 19345408,
+      "tokens/train_per_sec_per_gpu": 14.1,
+      "tokens/trainable": 6126315
+    },
+    {
+      "epoch": 1.1551515151515153,
+      "grad_norm": 0.03359396383166313,
+      "learning_rate": 0.00018976698372401716,
+      "loss": 0.0004557626787573099,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 1190,
+      "tokens/total": 19509248,
+      "tokens/train_per_sec_per_gpu": 14.6,
+      "tokens/trainable": 6178392
+    },
+    {
+      "epoch": 1.1648484848484848,
+      "grad_norm": 0.020522581413388252,
+      "learning_rate": 0.0001894665660888202,
+      "loss": 0.0006435967981815339,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1200,
+      "tokens/total": 19673088,
+      "tokens/train_per_sec_per_gpu": 15.47,
+      "tokens/trainable": 6230984
+    },
+    {
+      "epoch": 1.1745454545454546,
+      "grad_norm": 0.0025893959682434797,
+      "learning_rate": 0.00018916204714082034,
+      "loss": 0.0005178887862712145,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00052,
+      "step": 1210,
+      "tokens/total": 19836928,
+      "tokens/train_per_sec_per_gpu": 14.13,
+      "tokens/trainable": 6282713
+    },
+    {
+      "epoch": 1.1842424242424243,
+      "grad_norm": 0.017288153991103172,
+      "learning_rate": 0.00018885344083972914,
+      "loss": 0.0005050559528172016,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1220,
+      "tokens/total": 20000768,
+      "tokens/train_per_sec_per_gpu": 14.31,
+      "tokens/trainable": 6334555
+    },
+    {
+      "epoch": 1.1939393939393939,
+      "grad_norm": 0.00206086877733469,
+      "learning_rate": 0.00018854076133263003,
+      "loss": 0.00020185327157378196,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 1230,
+      "tokens/total": 20164608,
+      "tokens/train_per_sec_per_gpu": 14.72,
+      "tokens/trainable": 6386137
+    },
+    {
+      "epoch": 1.2036363636363636,
+      "grad_norm": 0.02184407040476799,
+      "learning_rate": 0.0001882240229533297,
+      "loss": 0.00048260441981256007,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00048,
+      "step": 1240,
+      "tokens/total": 20328448,
+      "tokens/train_per_sec_per_gpu": 14.35,
+      "tokens/trainable": 6437493
+    },
+    {
+      "epoch": 1.2133333333333334,
+      "grad_norm": 0.04215926304459572,
+      "learning_rate": 0.00018790324022170118,
+      "loss": 0.0003190681803971529,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1250,
+      "tokens/total": 20492288,
+      "tokens/train_per_sec_per_gpu": 14.51,
+      "tokens/trainable": 6488834
+    },
+    {
+      "epoch": 1.2230303030303031,
+      "grad_norm": 0.006890668533742428,
+      "learning_rate": 0.00018757842784301784,
+      "loss": 0.0005027144681662322,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0005,
+      "step": 1260,
+      "tokens/total": 20656128,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 6540606
+    },
+    {
+      "epoch": 1.2327272727272727,
+      "grad_norm": 0.005489532835781574,
+      "learning_rate": 0.00018724960070727972,
+      "loss": 0.0006080259568989277,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00061,
+      "step": 1270,
+      "tokens/total": 20819968,
+      "tokens/train_per_sec_per_gpu": 13.92,
+      "tokens/trainable": 6592727
+    },
+    {
+      "epoch": 1.2424242424242424,
+      "grad_norm": 0.005877023097127676,
+      "learning_rate": 0.00018691677388853068,
+      "loss": 0.0006749071180820465,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 1280,
+      "tokens/total": 20983808,
+      "tokens/train_per_sec_per_gpu": 14.93,
+      "tokens/trainable": 6645179
+    },
+    {
+      "epoch": 1.2521212121212122,
+      "grad_norm": 0.0061390516348183155,
+      "learning_rate": 0.00018657996264416745,
+      "loss": 0.0002642946550622582,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1290,
+      "tokens/total": 21147648,
+      "tokens/train_per_sec_per_gpu": 14.92,
+      "tokens/trainable": 6697406
+    },
+    {
+      "epoch": 1.2618181818181817,
+      "grad_norm": 0.03444842994213104,
+      "learning_rate": 0.0001862391824142402,
+      "loss": 0.0004464905709028244,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00045,
+      "step": 1300,
+      "tokens/total": 21311488,
+      "tokens/train_per_sec_per_gpu": 15.07,
+      "tokens/trainable": 6749589
+    },
+    {
+      "epoch": 1.2715151515151515,
+      "grad_norm": 0.0036635284777730703,
+      "learning_rate": 0.00018589444882074474,
+      "loss": 0.0002096141455695033,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00021,
+      "step": 1310,
+      "tokens/total": 21475328,
+      "tokens/train_per_sec_per_gpu": 13.69,
+      "tokens/trainable": 6801799
+    },
+    {
+      "epoch": 1.2812121212121212,
+      "grad_norm": 0.003200239036232233,
+      "learning_rate": 0.00018554577766690636,
+      "loss": 0.00026335257571190595,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 1320,
+      "tokens/total": 21639168,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 6854205
+    },
+    {
+      "epoch": 1.290909090909091,
+      "grad_norm": 0.00109296350274235,
+      "learning_rate": 0.0001851931849364554,
+      "loss": 0.0003910743165761232,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1330,
+      "tokens/total": 21803008,
+      "tokens/train_per_sec_per_gpu": 14.96,
+      "tokens/trainable": 6906145
+    },
+    {
+      "epoch": 1.3006060606060605,
+      "grad_norm": 0.0006913666147738695,
+      "learning_rate": 0.00018483668679289452,
+      "loss": 0.0003079640679061413,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1340,
+      "tokens/total": 21966848,
+      "tokens/train_per_sec_per_gpu": 15.13,
+      "tokens/trainable": 6957405
+    },
+    {
+      "epoch": 1.3103030303030303,
+      "grad_norm": 0.03036116063594818,
+      "learning_rate": 0.00018447629957875776,
+      "loss": 0.0003281526267528534,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 1350,
+      "tokens/total": 22130688,
+      "tokens/train_per_sec_per_gpu": 15.08,
+      "tokens/trainable": 7009256
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.012580045498907566,
+      "learning_rate": 0.00018411203981486134,
+      "loss": 0.0006514057982712984,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1360,
+      "tokens/total": 22294528,
+      "tokens/train_per_sec_per_gpu": 14.66,
+      "tokens/trainable": 7060734
+    },
+    {
+      "epoch": 1.3296969696969696,
+      "grad_norm": 0.00828342791646719,
+      "learning_rate": 0.00018374392419954628,
+      "loss": 0.0003020781092345715,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 1370,
+      "tokens/total": 22458368,
+      "tokens/train_per_sec_per_gpu": 15.09,
+      "tokens/trainable": 7112415
+    },
+    {
+      "epoch": 1.3393939393939394,
+      "grad_norm": 0.09482505917549133,
+      "learning_rate": 0.00018337196960791302,
+      "loss": 0.0006797847803682089,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 1380,
+      "tokens/total": 22622208,
+      "tokens/train_per_sec_per_gpu": 15.03,
+      "tokens/trainable": 7164110
+    },
+    {
+      "epoch": 1.3490909090909091,
+      "grad_norm": 0.04534842446446419,
+      "learning_rate": 0.00018299619309104773,
+      "loss": 0.000729580270126462,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00073,
+      "step": 1390,
+      "tokens/total": 22786048,
+      "tokens/train_per_sec_per_gpu": 15.49,
+      "tokens/trainable": 7215797
+    },
+    {
+      "epoch": 1.3587878787878789,
+      "grad_norm": 0.010737202130258083,
+      "learning_rate": 0.00018261661187524072,
+      "loss": 0.0007514740340411663,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00075,
+      "step": 1400,
+      "tokens/total": 22949888,
+      "tokens/train_per_sec_per_gpu": 14.14,
+      "tokens/trainable": 7267691
+    },
+    {
+      "epoch": 1.3684848484848484,
+      "grad_norm": 0.05600081756711006,
+      "learning_rate": 0.00018223324336119672,
+      "loss": 0.001420076284557581,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00142,
+      "step": 1410,
+      "tokens/total": 23113728,
+      "tokens/train_per_sec_per_gpu": 15.3,
+      "tokens/trainable": 7319876
+    },
+    {
+      "epoch": 1.3781818181818182,
+      "grad_norm": 0.019460471346974373,
+      "learning_rate": 0.00018184610512323718,
+      "loss": 0.0022406818345189093,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00224,
+      "step": 1420,
+      "tokens/total": 23277568,
+      "tokens/train_per_sec_per_gpu": 14.38,
+      "tokens/trainable": 7371762
+    },
+    {
+      "epoch": 1.387878787878788,
+      "grad_norm": 0.03277068957686424,
+      "learning_rate": 0.00018145521490849477,
+      "loss": 0.000915923435240984,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00092,
+      "step": 1430,
+      "tokens/total": 23441408,
+      "tokens/train_per_sec_per_gpu": 14.66,
+      "tokens/trainable": 7423685
+    },
+    {
+      "epoch": 1.3975757575757575,
+      "grad_norm": 0.0156385600566864,
+      "learning_rate": 0.0001810605906360996,
+      "loss": 0.000897888746112585,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0009,
+      "step": 1440,
+      "tokens/total": 23605248,
+      "tokens/train_per_sec_per_gpu": 13.99,
+      "tokens/trainable": 7476266
+    },
+    {
+      "epoch": 1.4072727272727272,
+      "grad_norm": 0.01643913984298706,
+      "learning_rate": 0.00018066225039635794,
+      "loss": 0.000922933965921402,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00092,
+      "step": 1450,
+      "tokens/total": 23769088,
+      "tokens/train_per_sec_per_gpu": 14.57,
+      "tokens/trainable": 7528208
+    },
+    {
+      "epoch": 1.416969696969697,
+      "grad_norm": 0.024322666227817535,
+      "learning_rate": 0.00018026021244992287,
+      "loss": 0.0011652217246592045,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00117,
+      "step": 1460,
+      "tokens/total": 23932928,
+      "tokens/train_per_sec_per_gpu": 13.91,
+      "tokens/trainable": 7580038
+    },
+    {
+      "epoch": 1.4266666666666667,
+      "grad_norm": 0.05165834724903107,
+      "learning_rate": 0.0001798544952269572,
+      "loss": 0.0009731135331094265,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 1470,
+      "tokens/total": 24096768,
+      "tokens/train_per_sec_per_gpu": 14.56,
+      "tokens/trainable": 7631772
+    },
+    {
+      "epoch": 1.4363636363636363,
+      "grad_norm": 0.02529827691614628,
+      "learning_rate": 0.0001794451173262885,
+      "loss": 0.0005802253726869822,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00058,
+      "step": 1480,
+      "tokens/total": 24260608,
+      "tokens/train_per_sec_per_gpu": 13.72,
+      "tokens/trainable": 7683048
+    },
+    {
+      "epoch": 1.446060606060606,
+      "grad_norm": 0.0670745000243187,
+      "learning_rate": 0.00017903209751455665,
+      "loss": 0.000642474414780736,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00064,
+      "step": 1490,
+      "tokens/total": 24424448,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 7735332
+    },
+    {
+      "epoch": 1.4557575757575758,
+      "grad_norm": 0.02367187850177288,
+      "learning_rate": 0.00017861545472535348,
+      "loss": 0.00032834114972501993,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 1500,
+      "tokens/total": 24588288,
+      "tokens/train_per_sec_per_gpu": 16.37,
+      "tokens/trainable": 7787186
+    },
+    {
+      "epoch": 1.4654545454545453,
+      "grad_norm": 0.011678172275424004,
+      "learning_rate": 0.00017819520805835475,
+      "loss": 0.0009690596722066403,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00097,
+      "step": 1510,
+      "tokens/total": 24752128,
+      "tokens/train_per_sec_per_gpu": 13.55,
+      "tokens/trainable": 7838878
+    },
+    {
+      "epoch": 1.475151515151515,
+      "grad_norm": 0.05298800393939018,
+      "learning_rate": 0.00017777137677844461,
+      "loss": 0.0009098535403609276,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00091,
+      "step": 1520,
+      "tokens/total": 24915968,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 7890631
+    },
+    {
+      "epoch": 1.4848484848484849,
+      "grad_norm": 0.037918779999017715,
+      "learning_rate": 0.00017734398031483265,
+      "loss": 0.0006457697600126266,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1530,
+      "tokens/total": 25079808,
+      "tokens/train_per_sec_per_gpu": 13.25,
+      "tokens/trainable": 7942366
+    },
+    {
+      "epoch": 1.4945454545454546,
+      "grad_norm": 0.02729674056172371,
+      "learning_rate": 0.0001769130382601629,
+      "loss": 0.0009943137876689434,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00099,
+      "step": 1540,
+      "tokens/total": 25243648,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 7994307
+    },
+    {
+      "epoch": 1.5023030303030303,
+      "eval_loss": 0.0006865999894216657,
+      "eval_ppl": 1.00069,
+      "eval_runtime": 12.127,
+      "eval_samples_per_second": 16.492,
+      "eval_steps_per_second": 8.246,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "step": 1548
+    },
+    {
+      "epoch": 1.5042424242424244,
+      "grad_norm": 0.053267233073711395,
+      "learning_rate": 0.00017647857036961592,
+      "loss": 0.0006284893956035375,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00063,
+      "step": 1550,
+      "tokens/total": 25407488,
+      "tokens/train_per_sec_per_gpu": 14.87,
+      "tokens/trainable": 8046124
+    },
+    {
+      "epoch": 1.513939393939394,
+      "grad_norm": 0.05232734978199005,
+      "learning_rate": 0.0001760405965600031,
+      "loss": 0.0005064161494374275,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00051,
+      "step": 1560,
+      "tokens/total": 25571328,
+      "tokens/train_per_sec_per_gpu": 14.39,
+      "tokens/trainable": 8098367
+    },
+    {
+      "epoch": 1.5236363636363637,
+      "grad_norm": 0.015440079383552074,
+      "learning_rate": 0.00017559913690885364,
+      "loss": 0.0004742793273180723,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 1570,
+      "tokens/total": 25735168,
+      "tokens/train_per_sec_per_gpu": 14.19,
+      "tokens/trainable": 8150005
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.005799058359116316,
+      "learning_rate": 0.00017515421165349414,
+      "loss": 0.0005522690713405609,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00055,
+      "step": 1580,
+      "tokens/total": 25899008,
+      "tokens/train_per_sec_per_gpu": 14.94,
+      "tokens/trainable": 8201985
+    },
+    {
+      "epoch": 1.543030303030303,
+      "grad_norm": 0.025745827704668045,
+      "learning_rate": 0.00017470584119012094,
+      "loss": 0.0004415466450154781,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1590,
+      "tokens/total": 26062848,
+      "tokens/train_per_sec_per_gpu": 14.76,
+      "tokens/trainable": 8253407
+    },
+    {
+      "epoch": 1.5527272727272727,
+      "grad_norm": 0.006111942231655121,
+      "learning_rate": 0.00017425404607286508,
+      "loss": 0.0004033858887851238,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1600,
+      "tokens/total": 26226688,
+      "tokens/train_per_sec_per_gpu": 13.45,
+      "tokens/trainable": 8305596
+    },
+    {
+      "epoch": 1.5624242424242425,
+      "grad_norm": 0.01315031573176384,
+      "learning_rate": 0.00017379884701285,
+      "loss": 0.0006456051021814346,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00065,
+      "step": 1610,
+      "tokens/total": 26390528,
+      "tokens/train_per_sec_per_gpu": 15.34,
+      "tokens/trainable": 8357648
+    },
+    {
+      "epoch": 1.5721212121212123,
+      "grad_norm": 0.002383842132985592,
+      "learning_rate": 0.00017334026487724225,
+      "loss": 0.00028960562776774167,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 1620,
+      "tokens/total": 26554368,
+      "tokens/train_per_sec_per_gpu": 14.29,
+      "tokens/trainable": 8410056
+    },
+    {
+      "epoch": 1.5818181818181818,
+      "grad_norm": 0.006294222082942724,
+      "learning_rate": 0.0001728783206882948,
+      "loss": 0.00025043871719390156,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 1630,
+      "tokens/total": 26718208,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 8461798
+    },
+    {
+      "epoch": 1.5915151515151515,
+      "grad_norm": 8.702854393050075e-05,
+      "learning_rate": 0.00017241303562238336,
+      "loss": 0.00012461008736863732,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 1640,
+      "tokens/total": 26882048,
+      "tokens/train_per_sec_per_gpu": 15.61,
+      "tokens/trainable": 8514035
+    },
+    {
+      "epoch": 1.601212121212121,
+      "grad_norm": 0.07624056935310364,
+      "learning_rate": 0.00017194443100903558,
+      "loss": 0.00024855402298271654,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 1650,
+      "tokens/total": 27045888,
+      "tokens/train_per_sec_per_gpu": 14.48,
+      "tokens/trainable": 8565875
+    },
+    {
+      "epoch": 1.6109090909090908,
+      "grad_norm": 0.02497026138007641,
+      "learning_rate": 0.00017147252832995337,
+      "loss": 0.00044286823831498625,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1660,
+      "tokens/total": 27209728,
+      "tokens/train_per_sec_per_gpu": 14.47,
+      "tokens/trainable": 8617912
+    },
+    {
+      "epoch": 1.6206060606060606,
+      "grad_norm": 0.0016530955908820033,
+      "learning_rate": 0.00017099734921802802,
+      "loss": 0.0003104714211076498,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1670,
+      "tokens/total": 27373568,
+      "tokens/train_per_sec_per_gpu": 13.53,
+      "tokens/trainable": 8669875
+    },
+    {
+      "epoch": 1.6303030303030304,
+      "grad_norm": 0.02621961385011673,
+      "learning_rate": 0.00017051891545634854,
+      "loss": 0.0004010321106761694,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 1680,
+      "tokens/total": 27537408,
+      "tokens/train_per_sec_per_gpu": 16.09,
+      "tokens/trainable": 8721709
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.043721288442611694,
+      "learning_rate": 0.00017003724897720316,
+      "loss": 0.00042473864741623404,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 1690,
+      "tokens/total": 27701248,
+      "tokens/train_per_sec_per_gpu": 14.84,
+      "tokens/trainable": 8773762
+    },
+    {
+      "epoch": 1.6496969696969697,
+      "grad_norm": 0.01791808009147644,
+      "learning_rate": 0.00016955237186107387,
+      "loss": 0.0003858121577650309,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1700,
+      "tokens/total": 27865088,
+      "tokens/train_per_sec_per_gpu": 14.87,
+      "tokens/trainable": 8825435
+    },
+    {
+      "epoch": 1.6593939393939394,
+      "grad_norm": 0.017175329849123955,
+      "learning_rate": 0.0001690643063356241,
+      "loss": 0.0003785108681768179,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00038,
+      "step": 1710,
+      "tokens/total": 28028928,
+      "tokens/train_per_sec_per_gpu": 13.63,
+      "tokens/trainable": 8877227
+    },
+    {
+      "epoch": 1.669090909090909,
+      "grad_norm": 0.03429865464568138,
+      "learning_rate": 0.0001685730747746799,
+      "loss": 0.0003128159558400512,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1720,
+      "tokens/total": 28192768,
+      "tokens/train_per_sec_per_gpu": 13.42,
+      "tokens/trainable": 8928835
+    },
+    {
+      "epoch": 1.6787878787878787,
+      "grad_norm": 0.008623798377811909,
+      "learning_rate": 0.0001680786996972043,
+      "loss": 0.0008884714916348457,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00089,
+      "step": 1730,
+      "tokens/total": 28356608,
+      "tokens/train_per_sec_per_gpu": 14.8,
+      "tokens/trainable": 8979863
+    },
+    {
+      "epoch": 1.6884848484848485,
+      "grad_norm": 0.007137796841561794,
+      "learning_rate": 0.00016758120376626488,
+      "loss": 0.000342932902276516,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00034,
+      "step": 1740,
+      "tokens/total": 28520448,
+      "tokens/train_per_sec_per_gpu": 13.64,
+      "tokens/trainable": 9031317
+    },
+    {
+      "epoch": 1.6981818181818182,
+      "grad_norm": 0.006754934322088957,
+      "learning_rate": 0.00016708060978799493,
+      "loss": 0.00031610706355422735,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1750,
+      "tokens/total": 28684288,
+      "tokens/train_per_sec_per_gpu": 16.63,
+      "tokens/trainable": 9082925
+    },
+    {
+      "epoch": 1.707878787878788,
+      "grad_norm": 0.012158721685409546,
+      "learning_rate": 0.00016657694071054794,
+      "loss": 0.00039324900135397913,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00039,
+      "step": 1760,
+      "tokens/total": 28848128,
+      "tokens/train_per_sec_per_gpu": 14.31,
+      "tokens/trainable": 9134535
+    },
+    {
+      "epoch": 1.7175757575757575,
+      "grad_norm": 0.04653792828321457,
+      "learning_rate": 0.00016607021962304565,
+      "loss": 0.0003617320442572236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 1770,
+      "tokens/total": 29011968,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 9186666
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 0.009638557210564613,
+      "learning_rate": 0.00016556046975451963,
+      "loss": 0.00031410730443894865,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1780,
+      "tokens/total": 29175808,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 9238529
+    },
+    {
+      "epoch": 1.7369696969696968,
+      "grad_norm": 0.017064686864614487,
+      "learning_rate": 0.0001650477144728462,
+      "loss": 0.00043909624218940735,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 1790,
+      "tokens/total": 29339648,
+      "tokens/train_per_sec_per_gpu": 14.08,
+      "tokens/trainable": 9290289
+    },
+    {
+      "epoch": 1.7466666666666666,
+      "grad_norm": 0.0022802259773015976,
+      "learning_rate": 0.00016453197728367563,
+      "loss": 0.00032380607444792986,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 1800,
+      "tokens/total": 29503488,
+      "tokens/train_per_sec_per_gpu": 13.73,
+      "tokens/trainable": 9341953
+    },
+    {
+      "epoch": 1.7563636363636363,
+      "grad_norm": 0.0036841712426394224,
+      "learning_rate": 0.00016401328182935417,
+      "loss": 0.0006712255533784627,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00067,
+      "step": 1810,
+      "tokens/total": 29667328,
+      "tokens/train_per_sec_per_gpu": 16.36,
+      "tokens/trainable": 9393126
+    },
+    {
+      "epoch": 1.766060606060606,
+      "grad_norm": 0.0006454121321439743,
+      "learning_rate": 0.0001634916518878404,
+      "loss": 0.00010477005271241069,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 1820,
+      "tokens/total": 29831168,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 9444494
+    },
+    {
+      "epoch": 1.7757575757575759,
+      "grad_norm": 0.035474907606840134,
+      "learning_rate": 0.00016296711137161535,
+      "loss": 0.00034273902419954536,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00034,
+      "step": 1830,
+      "tokens/total": 29995008,
+      "tokens/train_per_sec_per_gpu": 14.78,
+      "tokens/trainable": 9496432
+    },
+    {
+      "epoch": 1.7854545454545454,
+      "grad_norm": 0.0042278701439499855,
+      "learning_rate": 0.00016243968432658605,
+      "loss": 0.0004896576981991529,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 1840,
+      "tokens/total": 30158848,
+      "tokens/train_per_sec_per_gpu": 15.01,
+      "tokens/trainable": 9547913
+    },
+    {
+      "epoch": 1.7951515151515152,
+      "grad_norm": 0.008337569423019886,
+      "learning_rate": 0.00016190939493098344,
+      "loss": 0.0003711160738021135,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00037,
+      "step": 1850,
+      "tokens/total": 30322688,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 9599023
+    },
+    {
+      "epoch": 1.8048484848484847,
+      "grad_norm": 0.033457424491643906,
+      "learning_rate": 0.00016137626749425377,
+      "loss": 0.0005191094242036343,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00052,
+      "step": 1860,
+      "tokens/total": 30486528,
+      "tokens/train_per_sec_per_gpu": 14.35,
+      "tokens/trainable": 9651048
+    },
+    {
+      "epoch": 1.8145454545454545,
+      "grad_norm": 0.014811063185334206,
+      "learning_rate": 0.0001608403264559445,
+      "loss": 0.0002689486602321267,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 1870,
+      "tokens/total": 30650368,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 9703354
+    },
+    {
+      "epoch": 1.8242424242424242,
+      "grad_norm": 0.011829032562673092,
+      "learning_rate": 0.00016030159638458376,
+      "loss": 0.0003055253764614463,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00031,
+      "step": 1880,
+      "tokens/total": 30814208,
+      "tokens/train_per_sec_per_gpu": 14.05,
+      "tokens/trainable": 9755371
+    },
+    {
+      "epoch": 1.833939393939394,
+      "grad_norm": 0.003898326540365815,
+      "learning_rate": 0.00015976010197655397,
+      "loss": 0.00023026440758258104,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 1890,
+      "tokens/total": 30978048,
+      "tokens/train_per_sec_per_gpu": 13.89,
+      "tokens/trainable": 9807011
+    },
+    {
+      "epoch": 1.8436363636363637,
+      "grad_norm": 0.00993694830685854,
+      "learning_rate": 0.00015921586805496004,
+      "loss": 0.000414779270067811,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00041,
+      "step": 1900,
+      "tokens/total": 31141888,
+      "tokens/train_per_sec_per_gpu": 14.42,
+      "tokens/trainable": 9859849
+    },
+    {
+      "epoch": 1.8533333333333335,
+      "grad_norm": 0.00715588079765439,
+      "learning_rate": 0.0001586689195684911,
+      "loss": 0.0004666011780500412,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00047,
+      "step": 1910,
+      "tokens/total": 31305728,
+      "tokens/train_per_sec_per_gpu": 14.16,
+      "tokens/trainable": 9911712
+    },
+    {
+      "epoch": 1.863030303030303,
+      "grad_norm": 0.021137356758117676,
+      "learning_rate": 0.000158119281590277,
+      "loss": 0.00046254890039563177,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00046,
+      "step": 1920,
+      "tokens/total": 31469568,
+      "tokens/train_per_sec_per_gpu": 14.81,
+      "tokens/trainable": 9963813
+    },
+    {
+      "epoch": 1.8727272727272726,
+      "grad_norm": 0.0023340010084211826,
+      "learning_rate": 0.000157566979316739,
+      "loss": 0.0004919813480228185,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 1930,
+      "tokens/total": 31633408,
+      "tokens/train_per_sec_per_gpu": 15.8,
+      "tokens/trainable": 10015724
+    },
+    {
+      "epoch": 1.8824242424242423,
+      "grad_norm": 0.01151804905384779,
+      "learning_rate": 0.00015701203806643433,
+      "loss": 0.00023937469813972712,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00024,
+      "step": 1940,
+      "tokens/total": 31797248,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 10067073
+    },
+    {
+      "epoch": 1.892121212121212,
+      "grad_norm": 0.016535570845007896,
+      "learning_rate": 0.00015645448327889603,
+      "loss": 0.00021827330347150563,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 1950,
+      "tokens/total": 31961088,
+      "tokens/train_per_sec_per_gpu": 14.48,
+      "tokens/trainable": 10119393
+    },
+    {
+      "epoch": 1.9018181818181819,
+      "grad_norm": 0.0034130853600800037,
+      "learning_rate": 0.00015589434051346634,
+      "loss": 0.00017861993983387948,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00018,
+      "step": 1960,
+      "tokens/total": 32124928,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 10171930
+    },
+    {
+      "epoch": 1.9115151515151516,
+      "grad_norm": 0.02398502826690674,
+      "learning_rate": 0.0001553316354481253,
+      "loss": 0.00014141426654532552,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00014,
+      "step": 1970,
+      "tokens/total": 32288768,
+      "tokens/train_per_sec_per_gpu": 15.59,
+      "tokens/trainable": 10223639
+    },
+    {
+      "epoch": 1.9212121212121214,
+      "grad_norm": 0.0007365989149548113,
+      "learning_rate": 0.00015476639387831343,
+      "loss": 0.00011406640987843275,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 1980,
+      "tokens/total": 32452608,
+      "tokens/train_per_sec_per_gpu": 13.45,
+      "tokens/trainable": 10275019
+    },
+    {
+      "epoch": 1.930909090909091,
+      "grad_norm": 0.028317851945757866,
+      "learning_rate": 0.00015419864171574944,
+      "loss": 0.0004076042678207159,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00041,
+      "step": 1990,
+      "tokens/total": 32616448,
+      "tokens/train_per_sec_per_gpu": 14.68,
+      "tokens/trainable": 10327234
+    },
+    {
+      "epoch": 1.9406060606060604,
+      "grad_norm": 0.0007216805825009942,
+      "learning_rate": 0.00015362840498724215,
+      "loss": 0.0002287053968757391,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 2000,
+      "tokens/total": 32780288,
+      "tokens/train_per_sec_per_gpu": 14.77,
+      "tokens/trainable": 10379906
+    },
+    {
+      "epoch": 1.9503030303030302,
+      "grad_norm": 0.021391045302152634,
+      "learning_rate": 0.00015305570983349743,
+      "loss": 0.0006855262909084558,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00069,
+      "step": 2010,
+      "tokens/total": 32944128,
+      "tokens/train_per_sec_per_gpu": 13.75,
+      "tokens/trainable": 10431864
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.014411289244890213,
+      "learning_rate": 0.00015248058250792008,
+      "loss": 0.00020992583595216274,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00021,
+      "step": 2020,
+      "tokens/total": 33107968,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 10483503
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.0019180785166099668,
+      "learning_rate": 0.00015190304937540993,
+      "loss": 0.000295165297575295,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 2030,
+      "tokens/total": 33271808,
+      "tokens/train_per_sec_per_gpu": 15.32,
+      "tokens/trainable": 10534682
+    },
+    {
+      "epoch": 1.9793939393939395,
+      "grad_norm": 0.027906686067581177,
+      "learning_rate": 0.00015132313691115367,
+      "loss": 0.00030230602715164423,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 2040,
+      "tokens/total": 33435648,
+      "tokens/train_per_sec_per_gpu": 13.52,
+      "tokens/trainable": 10586848
+    },
+    {
+      "epoch": 1.9890909090909092,
+      "grad_norm": 0.030775317922234535,
+      "learning_rate": 0.00015074087169941085,
+      "loss": 0.00011671001557260752,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2050,
+      "tokens/total": 33599488,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 10638485
+    },
+    {
+      "epoch": 1.9987878787878788,
+      "grad_norm": 0.054577309638261795,
+      "learning_rate": 0.00015015628043229523,
+      "loss": 0.0003703285474330187,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00037,
+      "step": 2060,
+      "tokens/total": 33763328,
+      "tokens/train_per_sec_per_gpu": 14.81,
+      "tokens/trainable": 10689855
+    },
+    {
+      "epoch": 2.003878787878788,
+      "eval_loss": 0.00032737868605181575,
+      "eval_ppl": 1.00033,
+      "eval_runtime": 12.1345,
+      "eval_samples_per_second": 16.482,
+      "eval_steps_per_second": 8.241,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.73,
+      "memory/max_allocated (GiB)": 16.73,
+      "step": 2064
+    },
+    {
+      "epoch": 2.0096969696969698,
+      "grad_norm": 0.02574228309094906,
+      "learning_rate": 0.00014956938990855139,
+      "loss": 0.0006258985958993435,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00063,
+      "step": 2070,
+      "tokens/total": 33939456,
+      "tokens/train_per_sec_per_gpu": 15.27,
+      "tokens/trainable": 10745674
+    },
+    {
+      "epoch": 2.0193939393939395,
+      "grad_norm": 0.0003698334621731192,
+      "learning_rate": 0.00014898022703232604,
+      "loss": 0.00025913610588759186,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 2080,
+      "tokens/total": 34103296,
+      "tokens/train_per_sec_per_gpu": 14.61,
+      "tokens/trainable": 10797792
+    },
+    {
+      "epoch": 2.0290909090909093,
+      "grad_norm": 0.0033025413285940886,
+      "learning_rate": 0.00014838881881193468,
+      "loss": 0.0001973774516955018,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 2090,
+      "tokens/total": 34267136,
+      "tokens/train_per_sec_per_gpu": 14.68,
+      "tokens/trainable": 10849439
+    },
+    {
+      "epoch": 2.0387878787878786,
+      "grad_norm": 0.0001970751181943342,
+      "learning_rate": 0.00014779519235862365,
+      "loss": 0.00029088449664413927,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 2100,
+      "tokens/total": 34430976,
+      "tokens/train_per_sec_per_gpu": 14.21,
+      "tokens/trainable": 10902278
+    },
+    {
+      "epoch": 2.0484848484848484,
+      "grad_norm": 0.0011533941142261028,
+      "learning_rate": 0.00014719937488532706,
+      "loss": 0.00021680027712136506,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 2110,
+      "tokens/total": 34594816,
+      "tokens/train_per_sec_per_gpu": 14.83,
+      "tokens/trainable": 10954337
+    },
+    {
+      "epoch": 2.058181818181818,
+      "grad_norm": 0.0012934933183714747,
+      "learning_rate": 0.00014660139370541953,
+      "loss": 0.00015767107252031564,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 2120,
+      "tokens/total": 34758656,
+      "tokens/train_per_sec_per_gpu": 14.22,
+      "tokens/trainable": 11006253
+    },
+    {
+      "epoch": 2.067878787878788,
+      "grad_norm": 0.00458933599293232,
+      "learning_rate": 0.00014600127623146388,
+      "loss": 0.0001101671252399683,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 2130,
+      "tokens/total": 34922496,
+      "tokens/train_per_sec_per_gpu": 14.53,
+      "tokens/trainable": 11058062
+    },
+    {
+      "epoch": 2.0775757575757576,
+      "grad_norm": 0.0032617889810353518,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.00019488829420879483,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2140,
+      "tokens/total": 35086336,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 11109942
+    },
+    {
+      "epoch": 2.0872727272727274,
+      "grad_norm": 0.007860329002141953,
+      "learning_rate": 0.00014479474254005707,
+      "loss": 9.439463028684258e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00009,
+      "step": 2150,
+      "tokens/total": 35250176,
+      "tokens/train_per_sec_per_gpu": 15.29,
+      "tokens/trainable": 11161699
+    },
+    {
+      "epoch": 2.096969696969697,
+      "grad_norm": 0.0008931563934311271,
+      "learning_rate": 0.0001441883816323411,
+      "loss": 0.00016972824232652783,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00017,
+      "step": 2160,
+      "tokens/total": 35414016,
+      "tokens/train_per_sec_per_gpu": 15.51,
+      "tokens/trainable": 11213741
+    },
+    {
+      "epoch": 2.1066666666666665,
+      "grad_norm": 0.006945727858692408,
+      "learning_rate": 0.00014357999504751182,
+      "loss": 9.466245537623764e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00009,
+      "step": 2170,
+      "tokens/total": 35577856,
+      "tokens/train_per_sec_per_gpu": 14.46,
+      "tokens/trainable": 11265729
+    },
+    {
+      "epoch": 2.1163636363636362,
+      "grad_norm": 0.009756731800734997,
+      "learning_rate": 0.0001429696106751352,
+      "loss": 7.116884225979447e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00007,
+      "step": 2180,
+      "tokens/total": 35741696,
+      "tokens/train_per_sec_per_gpu": 14.9,
+      "tokens/trainable": 11318089
+    },
+    {
+      "epoch": 2.126060606060606,
+      "grad_norm": 0.003617421491071582,
+      "learning_rate": 0.00014235725649635933,
+      "loss": 0.00017703230259940027,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00018,
+      "step": 2190,
+      "tokens/total": 35905536,
+      "tokens/train_per_sec_per_gpu": 16.31,
+      "tokens/trainable": 11370159
+    },
+    {
+      "epoch": 2.1357575757575757,
+      "grad_norm": 0.0008388167480006814,
+      "learning_rate": 0.00014174296058263195,
+      "loss": 0.0002220547990873456,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 2200,
+      "tokens/total": 36069376,
+      "tokens/train_per_sec_per_gpu": 15.31,
+      "tokens/trainable": 11422568
+    },
+    {
+      "epoch": 2.1454545454545455,
+      "grad_norm": 0.03691717982292175,
+      "learning_rate": 0.00014112675109441352,
+      "loss": 0.00018518726574257016,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2210,
+      "tokens/total": 36233216,
+      "tokens/train_per_sec_per_gpu": 14.81,
+      "tokens/trainable": 11473971
+    },
+    {
+      "epoch": 2.1551515151515153,
+      "grad_norm": 0.0008130021742545068,
+      "learning_rate": 0.0001405086562798863,
+      "loss": 0.0001568903331644833,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 2220,
+      "tokens/total": 36397056,
+      "tokens/train_per_sec_per_gpu": 15.22,
+      "tokens/trainable": 11526106
+    },
+    {
+      "epoch": 2.164848484848485,
+      "grad_norm": 0.0014426361303776503,
+      "learning_rate": 0.00013988870447365933,
+      "loss": 0.00027461207937449215,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 2230,
+      "tokens/total": 36560896,
+      "tokens/train_per_sec_per_gpu": 15.95,
+      "tokens/trainable": 11578483
+    },
+    {
+      "epoch": 2.174545454545455,
+      "grad_norm": 0.029341408982872963,
+      "learning_rate": 0.00013926692409546964,
+      "loss": 0.0003196842735633254,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 2240,
+      "tokens/total": 36724736,
+      "tokens/train_per_sec_per_gpu": 16.15,
+      "tokens/trainable": 11630965
+    },
+    {
+      "epoch": 2.184242424242424,
+      "grad_norm": 0.00210795970633626,
+      "learning_rate": 0.00013864334364887943,
+      "loss": 0.0004162232857197523,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 2250,
+      "tokens/total": 36888576,
+      "tokens/train_per_sec_per_gpu": 15.03,
+      "tokens/trainable": 11682642
+    },
+    {
+      "epoch": 2.193939393939394,
+      "grad_norm": 0.003121949266642332,
+      "learning_rate": 0.0001380179917199692,
+      "loss": 0.00042150220833718776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00042,
+      "step": 2260,
+      "tokens/total": 37052416,
+      "tokens/train_per_sec_per_gpu": 15.47,
+      "tokens/trainable": 11734687
+    },
+    {
+      "epoch": 2.2036363636363636,
+      "grad_norm": 0.009584403596818447,
+      "learning_rate": 0.00013739089697602764,
+      "loss": 0.0003333257278427482,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 2270,
+      "tokens/total": 37216256,
+      "tokens/train_per_sec_per_gpu": 14.88,
+      "tokens/trainable": 11786194
+    },
+    {
+      "epoch": 2.2133333333333334,
+      "grad_norm": 0.0031741419807076454,
+      "learning_rate": 0.00013676208816423724,
+      "loss": 0.00011245617642998695,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 2280,
+      "tokens/total": 37380096,
+      "tokens/train_per_sec_per_gpu": 14.73,
+      "tokens/trainable": 11837550
+    },
+    {
+      "epoch": 2.223030303030303,
+      "grad_norm": 0.03865548223257065,
+      "learning_rate": 0.00013613159411035648,
+      "loss": 0.00020037838257849216,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 2290,
+      "tokens/total": 37543936,
+      "tokens/train_per_sec_per_gpu": 15.33,
+      "tokens/trainable": 11889401
+    },
+    {
+      "epoch": 2.232727272727273,
+      "grad_norm": 0.012145821005105972,
+      "learning_rate": 0.00013549944371739854,
+      "loss": 0.00011074641952291131,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 2300,
+      "tokens/total": 37707776,
+      "tokens/train_per_sec_per_gpu": 14.18,
+      "tokens/trainable": 11941616
+    },
+    {
+      "epoch": 2.242424242424242,
+      "grad_norm": 0.0009741581161506474,
+      "learning_rate": 0.00013486566596430623,
+      "loss": 0.00024885197635740044,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 2310,
+      "tokens/total": 37871616,
+      "tokens/train_per_sec_per_gpu": 14.86,
+      "tokens/trainable": 11993896
+    },
+    {
+      "epoch": 2.252121212121212,
+      "grad_norm": 0.011996032670140266,
+      "learning_rate": 0.00013423028990462344,
+      "loss": 0.0003463976550847292,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00035,
+      "step": 2320,
+      "tokens/total": 38035456,
+      "tokens/train_per_sec_per_gpu": 14.09,
+      "tokens/trainable": 12045275
+    },
+    {
+      "epoch": 2.2618181818181817,
+      "grad_norm": 0.021751079708337784,
+      "learning_rate": 0.0001335933446651636,
+      "loss": 0.0008397232741117477,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00084,
+      "step": 2330,
+      "tokens/total": 38199296,
+      "tokens/train_per_sec_per_gpu": 15.05,
+      "tokens/trainable": 12096897
+    },
+    {
+      "epoch": 2.2715151515151515,
+      "grad_norm": 0.02025892771780491,
+      "learning_rate": 0.00013295485944467405,
+      "loss": 0.0005815276876091957,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00058,
+      "step": 2340,
+      "tokens/total": 38363136,
+      "tokens/train_per_sec_per_gpu": 14.28,
+      "tokens/trainable": 12148443
+    },
+    {
+      "epoch": 2.2812121212121212,
+      "grad_norm": 0.028191884979605675,
+      "learning_rate": 0.0001323148635124978,
+      "loss": 0.00035780200269073246,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 2350,
+      "tokens/total": 38526976,
+      "tokens/train_per_sec_per_gpu": 14.5,
+      "tokens/trainable": 12200260
+    },
+    {
+      "epoch": 2.290909090909091,
+      "grad_norm": 0.018472714349627495,
+      "learning_rate": 0.00013167338620723165,
+      "loss": 0.0006046999711543322,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0006,
+      "step": 2360,
+      "tokens/total": 38690816,
+      "tokens/train_per_sec_per_gpu": 13.44,
+      "tokens/trainable": 12252405
+    },
+    {
+      "epoch": 2.3006060606060608,
+      "grad_norm": 0.018522929400205612,
+      "learning_rate": 0.00013103045693538135,
+      "loss": 0.000294373813085258,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00029,
+      "step": 2370,
+      "tokens/total": 38854656,
+      "tokens/train_per_sec_per_gpu": 15.42,
+      "tokens/trainable": 12304241
+    },
+    {
+      "epoch": 2.3103030303030305,
+      "grad_norm": 0.024094371125102043,
+      "learning_rate": 0.00013038610517001332,
+      "loss": 0.00027109310030937195,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00027,
+      "step": 2380,
+      "tokens/total": 39018496,
+      "tokens/train_per_sec_per_gpu": 14.73,
+      "tokens/trainable": 12356446
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.019156360998749733,
+      "learning_rate": 0.0001297403604494039,
+      "loss": 0.00016260554548352957,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 2390,
+      "tokens/total": 39182336,
+      "tokens/train_per_sec_per_gpu": 15.4,
+      "tokens/trainable": 12408205
+    },
+    {
+      "epoch": 2.3296969696969696,
+      "grad_norm": 0.030154094099998474,
+      "learning_rate": 0.00012909325237568496,
+      "loss": 0.0001862394856289029,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2400,
+      "tokens/total": 39346176,
+      "tokens/train_per_sec_per_gpu": 14.29,
+      "tokens/trainable": 12460035
+    },
+    {
+      "epoch": 2.3393939393939394,
+      "grad_norm": 0.0018396849045529962,
+      "learning_rate": 0.00012844481061348708,
+      "loss": 0.00013985306723043322,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00014,
+      "step": 2410,
+      "tokens/total": 39510016,
+      "tokens/train_per_sec_per_gpu": 14.37,
+      "tokens/trainable": 12512793
+    },
+    {
+      "epoch": 2.349090909090909,
+      "grad_norm": 0.007293887436389923,
+      "learning_rate": 0.00012779506488857945,
+      "loss": 0.0004945728462189436,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 2420,
+      "tokens/total": 39673856,
+      "tokens/train_per_sec_per_gpu": 14.49,
+      "tokens/trainable": 12564678
+    },
+    {
+      "epoch": 2.358787878787879,
+      "grad_norm": 0.0013043258804827929,
+      "learning_rate": 0.00012714404498650743,
+      "loss": 0.0002628775080665946,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 2430,
+      "tokens/total": 39837696,
+      "tokens/train_per_sec_per_gpu": 14.31,
+      "tokens/trainable": 12616633
+    },
+    {
+      "epoch": 2.3684848484848486,
+      "grad_norm": 0.00601148558780551,
+      "learning_rate": 0.00012649178075122702,
+      "loss": 0.0005043975077569484,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0005,
+      "step": 2440,
+      "tokens/total": 40001536,
+      "tokens/train_per_sec_per_gpu": 15.51,
+      "tokens/trainable": 12669042
+    },
+    {
+      "epoch": 2.378181818181818,
+      "grad_norm": 0.004092884249985218,
+      "learning_rate": 0.00012583830208373674,
+      "loss": 0.00020396907348185778,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 2450,
+      "tokens/total": 40165376,
+      "tokens/train_per_sec_per_gpu": 14.54,
+      "tokens/trainable": 12720635
+    },
+    {
+      "epoch": 2.3878787878787877,
+      "grad_norm": 0.004112009424716234,
+      "learning_rate": 0.00012518363894070683,
+      "loss": 0.00010208101011812686,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 2460,
+      "tokens/total": 40329216,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 12772667
+    },
+    {
+      "epoch": 2.3975757575757575,
+      "grad_norm": 0.005660182796418667,
+      "learning_rate": 0.00012452782133310624,
+      "loss": 0.0001985645852982998,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 2470,
+      "tokens/total": 40493056,
+      "tokens/train_per_sec_per_gpu": 14.85,
+      "tokens/trainable": 12824689
+    },
+    {
+      "epoch": 2.4072727272727272,
+      "grad_norm": 0.014492900110781193,
+      "learning_rate": 0.00012387087932482665,
+      "loss": 0.00014933901838958262,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00015,
+      "step": 2480,
+      "tokens/total": 40656896,
+      "tokens/train_per_sec_per_gpu": 15.28,
+      "tokens/trainable": 12876411
+    },
+    {
+      "epoch": 2.416969696969697,
+      "grad_norm": 0.0019427158404141665,
+      "learning_rate": 0.00012321284303130426,
+      "loss": 7.200292311608792e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00007,
+      "step": 2490,
+      "tokens/total": 40820736,
+      "tokens/train_per_sec_per_gpu": 15.06,
+      "tokens/trainable": 12928897
+    },
+    {
+      "epoch": 2.4266666666666667,
+      "grad_norm": 0.02509615570306778,
+      "learning_rate": 0.00012255374261813944,
+      "loss": 0.00043660206720232966,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00044,
+      "step": 2500,
+      "tokens/total": 40984576,
+      "tokens/train_per_sec_per_gpu": 14.48,
+      "tokens/trainable": 12980466
+    },
+    {
+      "epoch": 2.4363636363636365,
+      "grad_norm": 0.007349422667175531,
+      "learning_rate": 0.00012189360829971371,
+      "loss": 0.0001283957506529987,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00013,
+      "step": 2510,
+      "tokens/total": 41148416,
+      "tokens/train_per_sec_per_gpu": 15.15,
+      "tokens/trainable": 13032069
+    },
+    {
+      "epoch": 2.4460606060606063,
+      "grad_norm": 0.0029069455340504646,
+      "learning_rate": 0.00012123247033780476,
+      "loss": 6.898010615259409e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00007,
+      "step": 2520,
+      "tokens/total": 41312256,
+      "tokens/train_per_sec_per_gpu": 14.23,
+      "tokens/trainable": 13084418
+    },
+    {
+      "epoch": 2.4557575757575756,
+      "grad_norm": 0.010700283572077751,
+      "learning_rate": 0.00012057035904019913,
+      "loss": 0.00011750553967431188,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2530,
+      "tokens/total": 41476096,
+      "tokens/train_per_sec_per_gpu": 13.79,
+      "tokens/trainable": 13136606
+    },
+    {
+      "epoch": 2.4654545454545453,
+      "grad_norm": 0.002509322250261903,
+      "learning_rate": 0.00011990730475930288,
+      "loss": 0.0003227895824238658,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 2540,
+      "tokens/total": 41639936,
+      "tokens/train_per_sec_per_gpu": 14.24,
+      "tokens/trainable": 13188322
+    },
+    {
+      "epoch": 2.475151515151515,
+      "grad_norm": 0.009015699848532677,
+      "learning_rate": 0.00011924333789075013,
+      "loss": 0.00032298346050083635,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 2550,
+      "tokens/total": 41803776,
+      "tokens/train_per_sec_per_gpu": 14.52,
+      "tokens/trainable": 13240187
+    },
+    {
+      "epoch": 2.484848484848485,
+      "grad_norm": 0.3949132561683655,
+      "learning_rate": 0.00011857848887200973,
+      "loss": 0.0007695606444031,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00077,
+      "step": 2560,
+      "tokens/total": 41967616,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 13291657
+    },
+    {
+      "epoch": 2.4945454545454546,
+      "grad_norm": 0.0052930801175534725,
+      "learning_rate": 0.00011791278818098994,
+      "loss": 0.0016795439645648003,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00168,
+      "step": 2570,
+      "tokens/total": 42131456,
+      "tokens/train_per_sec_per_gpu": 14.83,
+      "tokens/trainable": 13343711
+    },
+    {
+      "epoch": 2.5042424242424244,
+      "grad_norm": 0.0013836952857673168,
+      "learning_rate": 0.00011724626633464127,
+      "loss": 0.0001935441978275776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2580,
+      "tokens/total": 42295296,
+      "tokens/train_per_sec_per_gpu": 13.84,
+      "tokens/trainable": 13396092
+    },
+    {
+      "epoch": 2.5042424242424244,
+      "eval_loss": 8.247328514698893e-05,
+      "eval_ppl": 1.00008,
+      "eval_runtime": 12.3103,
+      "eval_samples_per_second": 16.247,
+      "eval_steps_per_second": 8.123,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 14.2,
+      "memory/max_allocated (GiB)": 14.2,
+      "step": 2580
+    },
+    {
+      "epoch": 2.5139393939393937,
+      "grad_norm": 0.003271307796239853,
+      "learning_rate": 0.00011657895388755742,
+      "loss": 8.508508908562362e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00009,
+      "step": 2590,
+      "tokens/total": 42459136,
+      "tokens/train_per_sec_per_gpu": 13.47,
+      "tokens/trainable": 13448209
+    },
+    {
+      "epoch": 2.5236363636363635,
+      "grad_norm": 0.0009727279539220035,
+      "learning_rate": 0.00011591088143057483,
+      "loss": 3.968240635003895e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00004,
+      "step": 2600,
+      "tokens/total": 42622976,
+      "tokens/train_per_sec_per_gpu": 13.54,
+      "tokens/trainable": 13499718
+    },
+    {
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.0010527002159506083,
+      "learning_rate": 0.00011524207958937001,
+      "loss": 0.00018399815307930113,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00018,
+      "step": 2610,
+      "tokens/total": 42786816,
+      "tokens/train_per_sec_per_gpu": 15.62,
+      "tokens/trainable": 13551815
+    },
+    {
+      "epoch": 2.543030303030303,
+      "grad_norm": 0.023774035274982452,
+      "learning_rate": 0.00011457257902305598,
+      "loss": 0.0003953744191676378,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0004,
+      "step": 2620,
+      "tokens/total": 42950656,
+      "tokens/train_per_sec_per_gpu": 14.46,
+      "tokens/trainable": 13603678
+    },
+    {
+      "epoch": 2.5527272727272727,
+      "grad_norm": 0.013190316036343575,
+      "learning_rate": 0.00011390241042277654,
+      "loss": 0.0005875382572412491,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00059,
+      "step": 2630,
+      "tokens/total": 43114496,
+      "tokens/train_per_sec_per_gpu": 13.34,
+      "tokens/trainable": 13655501
+    },
+    {
+      "epoch": 2.5624242424242425,
+      "grad_norm": 0.019383637234568596,
+      "learning_rate": 0.00011323160451029932,
+      "loss": 0.0002609423128888011,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00026,
+      "step": 2640,
+      "tokens/total": 43278336,
+      "tokens/train_per_sec_per_gpu": 15.43,
+      "tokens/trainable": 13707528
+    },
+    {
+      "epoch": 2.5721212121212123,
+      "grad_norm": 0.002637348370626569,
+      "learning_rate": 0.00011256019203660764,
+      "loss": 0.0003633877262473106,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00036,
+      "step": 2650,
+      "tokens/total": 43442176,
+      "tokens/train_per_sec_per_gpu": 13.95,
+      "tokens/trainable": 13758801
+    },
+    {
+      "epoch": 2.581818181818182,
+      "grad_norm": 0.008525248616933823,
+      "learning_rate": 0.00011188820378049065,
+      "loss": 0.000345646683126688,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00035,
+      "step": 2660,
+      "tokens/total": 43606016,
+      "tokens/train_per_sec_per_gpu": 16.61,
+      "tokens/trainable": 13810541
+    },
+    {
+      "epoch": 2.5915151515151518,
+      "grad_norm": 0.003398684086278081,
+      "learning_rate": 0.00011121567054713244,
+      "loss": 0.00010743099264800548,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 2670,
+      "tokens/total": 43769856,
+      "tokens/train_per_sec_per_gpu": 13.59,
+      "tokens/trainable": 13861683
+    },
+    {
+      "epoch": 2.601212121212121,
+      "grad_norm": 0.048622433096170425,
+      "learning_rate": 0.00011054262316669986,
+      "loss": 0.0006771612912416458,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00068,
+      "step": 2680,
+      "tokens/total": 43933696,
+      "tokens/train_per_sec_per_gpu": 13.89,
+      "tokens/trainable": 13913157
+    },
+    {
+      "epoch": 2.610909090909091,
+      "grad_norm": 0.015018350444734097,
+      "learning_rate": 0.00010986909249292922,
+      "loss": 0.00019932850264012814,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0002,
+      "step": 2690,
+      "tokens/total": 44097536,
+      "tokens/train_per_sec_per_gpu": 14.55,
+      "tokens/trainable": 13965160
+    },
+    {
+      "epoch": 2.6206060606060606,
+      "grad_norm": 0.0012435365933924913,
+      "learning_rate": 0.00010919510940171189,
+      "loss": 5.868576117791235e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00006,
+      "step": 2700,
+      "tokens/total": 44261376,
+      "tokens/train_per_sec_per_gpu": 14.73,
+      "tokens/trainable": 14017024
+    },
+    {
+      "epoch": 2.6303030303030304,
+      "grad_norm": 0.0019523982191458344,
+      "learning_rate": 0.00010852070478967889,
+      "loss": 0.0001263051643036306,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00013,
+      "step": 2710,
+      "tokens/total": 44425216,
+      "tokens/train_per_sec_per_gpu": 13.44,
+      "tokens/trainable": 14068784
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.007224493194371462,
+      "learning_rate": 0.0001078459095727845,
+      "loss": 0.0001929138321429491,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2720,
+      "tokens/total": 44589056,
+      "tokens/train_per_sec_per_gpu": 14.47,
+      "tokens/trainable": 14120598
+    },
+    {
+      "epoch": 2.6496969696969694,
+      "grad_norm": 0.047363366931676865,
+      "learning_rate": 0.00010717075468488913,
+      "loss": 0.00019309332128614187,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00019,
+      "step": 2730,
+      "tokens/total": 44752896,
+      "tokens/train_per_sec_per_gpu": 15.31,
+      "tokens/trainable": 14172560
+    },
+    {
+      "epoch": 2.659393939393939,
+      "grad_norm": 0.001373408129438758,
+      "learning_rate": 0.00010649527107634108,
+      "loss": 9.99198411591351e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 2740,
+      "tokens/total": 44916736,
+      "tokens/train_per_sec_per_gpu": 14.44,
+      "tokens/trainable": 14223646
+    },
+    {
+      "epoch": 2.669090909090909,
+      "grad_norm": 0.0005223056650720537,
+      "learning_rate": 0.00010581948971255788,
+      "loss": 0.0001228376990184188,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2750,
+      "tokens/total": 45080576,
+      "tokens/train_per_sec_per_gpu": 14.26,
+      "tokens/trainable": 14275006
+    },
+    {
+      "epoch": 2.6787878787878787,
+      "grad_norm": 0.0011381276417523623,
+      "learning_rate": 0.00010514344157260673,
+      "loss": 5.9981108643114565e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00006,
+      "step": 2760,
+      "tokens/total": 45244416,
+      "tokens/train_per_sec_per_gpu": 13.74,
+      "tokens/trainable": 14327112
+    },
+    {
+      "epoch": 2.6884848484848485,
+      "grad_norm": 0.0028999936766922474,
+      "learning_rate": 0.00010446715764778423,
+      "loss": 0.0001589686726219952,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 2770,
+      "tokens/total": 45408256,
+      "tokens/train_per_sec_per_gpu": 15.54,
+      "tokens/trainable": 14378961
+    },
+    {
+      "epoch": 2.6981818181818182,
+      "grad_norm": 0.0008248479571193457,
+      "learning_rate": 0.00010379066894019589,
+      "loss": 0.00013254316290840508,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00013,
+      "step": 2780,
+      "tokens/total": 45572096,
+      "tokens/train_per_sec_per_gpu": 14.41,
+      "tokens/trainable": 14430803
+    },
+    {
+      "epoch": 2.707878787878788,
+      "grad_norm": 0.00010997291246894747,
+      "learning_rate": 0.00010311400646133482,
+      "loss": 0.0001163567416369915,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2790,
+      "tokens/total": 45735936,
+      "tokens/train_per_sec_per_gpu": 14.58,
+      "tokens/trainable": 14482749
+    },
+    {
+      "epoch": 2.7175757575757578,
+      "grad_norm": 0.004438972566276789,
+      "learning_rate": 0.00010243720123066011,
+      "loss": 0.0008217763155698776,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00082,
+      "step": 2800,
+      "tokens/total": 45899776,
+      "tokens/train_per_sec_per_gpu": 13.59,
+      "tokens/trainable": 14534668
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.0006182301440276206,
+      "learning_rate": 0.0001017602842741749,
+      "loss": 0.00021976977586746216,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00022,
+      "step": 2810,
+      "tokens/total": 46063616,
+      "tokens/train_per_sec_per_gpu": 15.1,
+      "tokens/trainable": 14586586
+    },
+    {
+      "epoch": 2.736969696969697,
+      "grad_norm": 0.003250017762184143,
+      "learning_rate": 0.000101083286623004,
+      "loss": 0.00012328216107562184,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2820,
+      "tokens/total": 46227456,
+      "tokens/train_per_sec_per_gpu": 13.96,
+      "tokens/trainable": 14638808
+    },
+    {
+      "epoch": 2.7466666666666666,
+      "grad_norm": 0.010098662227392197,
+      "learning_rate": 0.00010040623931197144,
+      "loss": 7.462603389285505e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00007,
+      "step": 2830,
+      "tokens/total": 46391296,
+      "tokens/train_per_sec_per_gpu": 13.8,
+      "tokens/trainable": 14690613
+    },
+    {
+      "epoch": 2.7563636363636363,
+      "grad_norm": 0.002696437295526266,
+      "learning_rate": 9.972917337817771e-05,
+      "loss": 4.609748430084437e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00005,
+      "step": 2840,
+      "tokens/total": 46555136,
+      "tokens/train_per_sec_per_gpu": 14.39,
+      "tokens/trainable": 14742309
+    },
+    {
+      "epoch": 2.766060606060606,
+      "grad_norm": 0.0002640737220644951,
+      "learning_rate": 9.905211985957706e-05,
+      "loss": 9.76522103883326e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 2850,
+      "tokens/total": 46718976,
+      "tokens/train_per_sec_per_gpu": 14.89,
+      "tokens/trainable": 14794633
+    },
+    {
+      "epoch": 2.775757575757576,
+      "grad_norm": 0.0023825804237276316,
+      "learning_rate": 9.837510979355457e-05,
+      "loss": 9.005467290990055e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00009,
+      "step": 2860,
+      "tokens/total": 46882816,
+      "tokens/train_per_sec_per_gpu": 13.85,
+      "tokens/trainable": 14846334
+    },
+    {
+      "epoch": 2.785454545454545,
+      "grad_norm": 0.007719525136053562,
+      "learning_rate": 9.769817421550335e-05,
+      "loss": 0.00035368206445127723,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00035,
+      "step": 2870,
+      "tokens/total": 47046656,
+      "tokens/train_per_sec_per_gpu": 14.04,
+      "tokens/trainable": 14898484
+    },
+    {
+      "epoch": 2.795151515151515,
+      "grad_norm": 0.0010807636426761746,
+      "learning_rate": 9.702134415740192e-05,
+      "loss": 9.26341162994504e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00009,
+      "step": 2880,
+      "tokens/total": 47210496,
+      "tokens/train_per_sec_per_gpu": 14.7,
+      "tokens/trainable": 14950418
+    },
+    {
+      "epoch": 2.8048484848484847,
+      "grad_norm": 0.02270282432436943,
+      "learning_rate": 9.634465064639153e-05,
+      "loss": 0.00013720652787014843,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00014,
+      "step": 2890,
+      "tokens/total": 47374336,
+      "tokens/train_per_sec_per_gpu": 13.75,
+      "tokens/trainable": 15002347
+    },
+    {
+      "epoch": 2.8145454545454545,
+      "grad_norm": 0.05273193120956421,
+      "learning_rate": 9.56681247033538e-05,
+      "loss": 0.0002461188472807407,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00025,
+      "step": 2900,
+      "tokens/total": 47538176,
+      "tokens/train_per_sec_per_gpu": 13.76,
+      "tokens/trainable": 15054325
+    },
+    {
+      "epoch": 2.824242424242424,
+      "grad_norm": 0.021871395409107208,
+      "learning_rate": 9.499179734148883e-05,
+      "loss": 9.564256761223078e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 2910,
+      "tokens/total": 47702016,
+      "tokens/train_per_sec_per_gpu": 14.34,
+      "tokens/trainable": 15105722
+    },
+    {
+      "epoch": 2.833939393939394,
+      "grad_norm": 0.0173841193318367,
+      "learning_rate": 9.431569956489331e-05,
+      "loss": 0.00014969281619414687,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00015,
+      "step": 2920,
+      "tokens/total": 47865856,
+      "tokens/train_per_sec_per_gpu": 14.11,
+      "tokens/trainable": 15157622
+    },
+    {
+      "epoch": 2.8436363636363637,
+      "grad_norm": 0.015775226056575775,
+      "learning_rate": 9.363986236713933e-05,
+      "loss": 0.00022732678335160016,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 2930,
+      "tokens/total": 48029696,
+      "tokens/train_per_sec_per_gpu": 13.86,
+      "tokens/trainable": 15208749
+    },
+    {
+      "epoch": 2.8533333333333335,
+      "grad_norm": 0.0024653058499097824,
+      "learning_rate": 9.296431672985363e-05,
+      "loss": 0.0001259389566257596,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00013,
+      "step": 2940,
+      "tokens/total": 48193536,
+      "tokens/train_per_sec_per_gpu": 14.64,
+      "tokens/trainable": 15260098
+    },
+    {
+      "epoch": 2.8630303030303033,
+      "grad_norm": 0.000619547616224736,
+      "learning_rate": 9.228909362129722e-05,
+      "loss": 7.931838044896721e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00008,
+      "step": 2950,
+      "tokens/total": 48357376,
+      "tokens/train_per_sec_per_gpu": 13.87,
+      "tokens/trainable": 15311590
+    },
+    {
+      "epoch": 2.8727272727272726,
+      "grad_norm": 0.016801398247480392,
+      "learning_rate": 9.16142239949458e-05,
+      "loss": 0.00022562453523278236,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00023,
+      "step": 2960,
+      "tokens/total": 48521216,
+      "tokens/train_per_sec_per_gpu": 14.63,
+      "tokens/trainable": 15363058
+    },
+    {
+      "epoch": 2.8824242424242423,
+      "grad_norm": 0.0022071890998631716,
+      "learning_rate": 9.093973878807072e-05,
+      "loss": 0.00012458593118935823,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00012,
+      "step": 2970,
+      "tokens/total": 48685056,
+      "tokens/train_per_sec_per_gpu": 14.62,
+      "tokens/trainable": 15415260
+    },
+    {
+      "epoch": 2.892121212121212,
+      "grad_norm": 0.004443019162863493,
+      "learning_rate": 9.026566892032105e-05,
+      "loss": 0.0001334903878159821,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00013,
+      "step": 2980,
+      "tokens/total": 48848896,
+      "tokens/train_per_sec_per_gpu": 14.95,
+      "tokens/trainable": 15466621
+    },
+    {
+      "epoch": 2.901818181818182,
+      "grad_norm": 0.0007753843092359602,
+      "learning_rate": 8.959204529230569e-05,
+      "loss": 0.00028287877794355156,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00028,
+      "step": 2990,
+      "tokens/total": 49012736,
+      "tokens/train_per_sec_per_gpu": 14.32,
+      "tokens/trainable": 15517914
+    },
+    {
+      "epoch": 2.9115151515151516,
+      "grad_norm": 0.0011952788336202502,
+      "learning_rate": 8.891889878417724e-05,
+      "loss": 0.000494527630507946,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00049,
+      "step": 3000,
+      "tokens/total": 49176576,
+      "tokens/train_per_sec_per_gpu": 14.87,
+      "tokens/trainable": 15569029
+    },
+    {
+      "epoch": 2.9212121212121214,
+      "grad_norm": 0.0041526807472109795,
+      "learning_rate": 8.824626025421626e-05,
+      "loss": 0.00010177484946325422,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0001,
+      "step": 3010,
+      "tokens/total": 49340416,
+      "tokens/train_per_sec_per_gpu": 15.38,
+      "tokens/trainable": 15620849
+    },
+    {
+      "epoch": 2.9309090909090907,
+      "grad_norm": 0.00011553156218724325,
+      "learning_rate": 8.757416053741649e-05,
+      "loss": 0.00010593911865726113,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00011,
+      "step": 3020,
+      "tokens/total": 49504256,
+      "tokens/train_per_sec_per_gpu": 14.3,
+      "tokens/trainable": 15671996
+    },
+    {
+      "epoch": 2.9406060606060604,
+      "grad_norm": 0.0024511946830898523,
+      "learning_rate": 8.690263044407168e-05,
+      "loss": 0.0001637642504647374,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 3030,
+      "tokens/total": 49668096,
+      "tokens/train_per_sec_per_gpu": 14.01,
+      "tokens/trainable": 15723682
+    },
+    {
+      "epoch": 2.95030303030303,
+      "grad_norm": 0.007446048315614462,
+      "learning_rate": 8.62317007583628e-05,
+      "loss": 5.339759518392384e-05,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00005,
+      "step": 3040,
+      "tokens/total": 49831936,
+      "tokens/train_per_sec_per_gpu": 15.18,
+      "tokens/trainable": 15775385
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.0077268267050385475,
+      "learning_rate": 8.556140223694718e-05,
+      "loss": 0.00031895393040031194,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00032,
+      "step": 3050,
+      "tokens/total": 49995776,
+      "tokens/train_per_sec_per_gpu": 15.09,
+      "tokens/trainable": 15827301
+    },
+    {
+      "epoch": 2.9696969696969697,
+      "grad_norm": 0.038065724074840546,
+      "learning_rate": 8.489176560754834e-05,
+      "loss": 0.00015137892914935948,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00015,
+      "step": 3060,
+      "tokens/total": 50159616,
+      "tokens/train_per_sec_per_gpu": 15.25,
+      "tokens/trainable": 15879448
+    },
+    {
+      "epoch": 2.9793939393939395,
+      "grad_norm": 0.01792677491903305,
+      "learning_rate": 8.422282156754741e-05,
+      "loss": 0.00016337501583620905,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00016,
+      "step": 3070,
+      "tokens/total": 50323456,
+      "tokens/train_per_sec_per_gpu": 14.83,
+      "tokens/trainable": 15930723
+    },
+    {
+      "epoch": 2.9890909090909092,
+      "grad_norm": 0.03399665653705597,
+      "learning_rate": 8.355460078257607e-05,
+      "loss": 0.0003045425517484546,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.0003,
+      "step": 3080,
+      "tokens/total": 50487296,
+      "tokens/train_per_sec_per_gpu": 14.33,
+      "tokens/trainable": 15981910
+    },
+    {
+      "epoch": 2.998787878787879,
+      "grad_norm": 0.029494913294911385,
+      "learning_rate": 8.288713388511047e-05,
+      "loss": 0.0003337380010634661,
+      "memory/device_reserved (GiB)": 20.01,
+      "memory/max_active (GiB)": 16.23,
+      "memory/max_allocated (GiB)": 16.23,
+      "ppl": 1.00033,
+      "step": 3090,
+      "tokens/total": 50651136,
+      "tokens/train_per_sec_per_gpu": 15.43,
+      "tokens/trainable": 16034665
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5155,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 1031,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1250980409971835e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}