{"step": 0, "timestamp": 1778325713.1069465, "grad/layer_0/attn": 0.01274854689836502, "grad/layer_0/mlp": 0.010575518943369389, "grad/layer_0/attn_mlp_ratio": 1.2054771823571224, "grad/layer_4/attn": 0.013495877385139465, "grad/layer_4/mlp": 0.01174549013376236, "grad/layer_4/attn_mlp_ratio": 1.1490263170408694, "grad/layer_8/attn": 0.02344369888305664, "grad/layer_8/mlp": 0.012149626389145851, "grad/layer_8/attn_mlp_ratio": 1.929581860314851, "grad/layer_12/attn": 0.037834133952856064, "grad/layer_12/mlp": 0.03287697583436966, "grad/layer_12/attn_mlp_ratio": 1.1507790141155951, "grad/layer_16/attn": 0.012019659392535686, "grad/layer_16/mlp": 0.00824613869190216, "grad/layer_16/attn_mlp_ratio": 1.4576106097485508, "grad/layer_20/attn": 0.008193372748792171, "grad/layer_20/mlp": 0.014974324963986874, "grad/layer_20/attn_mlp_ratio": 0.5471614055245244, "grad/layer_24/attn": 0.023483537137508392, "grad/layer_24/mlp": 0.0186799056828022, "grad/layer_24/attn_mlp_ratio": 1.257155009803566, "grad/layer_27/attn": 0.0240637119859457, "grad/layer_27/mlp": 0.0157768614590168, "grad/layer_27/attn_mlp_ratio": 1.525253415955393} {"step": 0, "timestamp": 1778325714.6027052, "eos/sharpness": 31.60500526428222, "eos/L0_probe": 2.7444920539855957, "eos/L_plus": 2.902123212814331, "eos/L_minus": 2.9029109477996826, "eos/grad_norm": 0.2873528301715851, "eos/embed_grad_frac": 0.14796945452690125, "eos/time_s": 1.4911069869995117} {"step": 0, "timestamp": 1778325714.8377905, "train/loss": 2.7233519554138184, "train/z_loss": 0.0012290733866393566, "train/perplexity": 15.231291383704246, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.0, "optim/adamw_lr": 0.0, "perf/tokens_per_sec": 30986.574377984827, "perf/iters_per_sec": 0.01477555006884805, "perf/gpu_mem_gb": 77.567553024, "perf/step_time_s": 67.67937541007996, "data/tokens_consumed": 2097152, "data/tokens_consumed_B": 0.002097152} {"step": 0, "timestamp": 1778325716.4002905, "geo/rankme_last": 414.6168212890625, "geo/layer_0/stable_rank_q_proj": 19.193296432495117, "geo/layer_0/stable_rank_k_proj": 15.867403984069824, "geo/layer_0/stable_rank_o_proj": 46.774322509765625, "geo/layer_0/stable_rank_gate_proj": 129.22052001953125, "geo/layer_0/stable_rank_down_proj": 56.18280029296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06415507942438126, "geo/layer_0/attn_entropy_mean": 6.1754608154296875, "geo/layer_0/attn_entropy_std": 0.40654894709587097, "geo/layer_7/stable_rank_q_proj": 42.9261589050293, "geo/layer_7/stable_rank_k_proj": 40.220340728759766, "geo/layer_7/stable_rank_o_proj": 89.42730712890625, "geo/layer_7/stable_rank_gate_proj": 78.60363006591797, "geo/layer_7/stable_rank_down_proj": 139.818115234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.424412339925766, "geo/layer_7/attn_entropy_mean": 4.719477653503418, "geo/layer_7/attn_entropy_std": 0.7567178606987, "geo/layer_14/stable_rank_q_proj": 50.32963943481445, "geo/layer_14/stable_rank_k_proj": 41.07245635986328, "geo/layer_14/stable_rank_o_proj": 43.46257400512695, "geo/layer_14/stable_rank_gate_proj": 70.91988372802734, "geo/layer_14/stable_rank_down_proj": 126.74287414550781, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4118182957172394, "geo/layer_14/attn_entropy_mean": 5.628140449523926, "geo/layer_14/attn_entropy_std": 0.3739416003227234, "geo/layer_21/stable_rank_q_proj": 39.94040298461914, "geo/layer_21/stable_rank_k_proj": 30.13866424560547, "geo/layer_21/stable_rank_o_proj": 68.94437408447266, "geo/layer_21/stable_rank_gate_proj": 64.3838882446289, "geo/layer_21/stable_rank_down_proj": 49.86330032348633, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15011252462863922, "geo/layer_21/attn_entropy_mean": 5.924209117889404, "geo/layer_21/attn_entropy_std": 0.30203068256378174, "geo/layer_27/stable_rank_q_proj": 43.58454895019531, "geo/layer_27/stable_rank_k_proj": 32.187374114990234, "geo/layer_27/stable_rank_o_proj": 115.17939758300781, "geo/layer_27/stable_rank_gate_proj": 78.0197982788086, "geo/layer_27/stable_rank_down_proj": 127.81610870361328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11942044645547867, "geo/layer_27/attn_entropy_mean": 4.347089767456055, "geo/layer_27/attn_entropy_std": 0.6140678524971008, "attnres/final_alpha/block_0": 0.24166473746299744, "attnres/block_norm/0": 1.7861855030059814, "attnres/final_alpha/block_1": 0.004180207382887602, "attnres/block_norm/1": 47484.453125, "attnres/final_alpha/block_2": 0.009070562198758125, "attnres/block_norm/2": 28823.330078125, "attnres/final_alpha/block_3": 0.010329082608222961, "attnres/block_norm/3": 62658.16015625, "attnres/final_alpha/block_4": 0.012481547892093658, "attnres/block_norm/4": 15732.85546875, "attnres/final_alpha/block_5": 0.6104215383529663, "attnres/block_norm/5": 6249.4189453125, "attnres/final_alpha/block_6": 0.11185230314731598, "attnres/block_norm/6": 42571.29296875, "geo/tier1_time_s": 1.5578515529632568, "geo/step": 0.0} {"step": 0, "timestamp": 1778325723.658889, "geo/ww_alpha_mean": 7.489332379498283, "geo/ww_alpha_std": 4.373473276935544, "geo/ww_alpha_min": 1.3433596525156066, "geo/ww_alpha_max": 31.328776671143974, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.940450223533285, "geo/ww_alpha_by_type/k_proj": 4.495009484239717, "geo/ww_alpha_by_type/v_proj": 7.878990825337837, "geo/ww_alpha_by_type/o_proj": 8.746923665315565, "geo/ww_alpha_by_type/gate_proj": 7.734723995565899, "geo/ww_alpha_by_type/up_proj": 11.50472885209436, "geo/ww_alpha_by_type/down_proj": 8.214260212517024, "geo/twonn_id/layer_0": 0.8052487373352051, "geo/twonn_id/layer_7": 3.562678098678589, "geo/twonn_id/layer_14": 5.854478359222412, "geo/twonn_id/layer_21": 8.970542907714844, "geo/twonn_id/layer_27": 5.927097797393799, "geo/tier2_time_s": 7.252217769622803} {"step": 0, "timestamp": 1778325724.9491863, "eoc/jacobian_sigma/layer_0/attn": 1510.6783447265625, "eoc/jacobian_sigma/layer_0/mlp": 7496.0986328125, "eoc/jacobian_sigma/layer_0": 7496.0986328125, "eoc/jacobian_sigma/layer_7/attn": 1.0177288055419922, "eoc/jacobian_sigma/layer_7/mlp": 1.348183512687683, "eoc/jacobian_sigma/layer_7": 1.348183512687683, "eoc/jacobian_sigma/layer_14/attn": 1.2341843843460083, "eoc/jacobian_sigma/layer_14/mlp": 7.28255033493042, "eoc/jacobian_sigma/layer_14": 7.28255033493042, "eoc/jacobian_sigma/layer_21/attn": 1.006435513496399, "eoc/jacobian_sigma/layer_21/mlp": 3.1554949283599854, "eoc/jacobian_sigma/layer_21": 3.1554949283599854, "eoc/jacobian_sigma/layer_27/attn": 2.4914205074310303, "eoc/jacobian_sigma/layer_27/mlp": 16.65950584411621, "eoc/jacobian_sigma/layer_27": 16.65950584411621, "eoc/layer0_sigma": 7496.0986328125, "eoc/sigma_max": 16.65950584411621, "eoc/sigma_min": 1.348183512687683, "eoc/sigma_mean": 7.111433655023575, "eoc/time_s": 1.2826380729675293} {"step": 10, "timestamp": 1778325735.3362224, "train/loss": 2.6329731702804566, "train/z_loss": 0.001201186585240066, "train/perplexity": 13.915080363567776, "train/grad_norm": 1.2734375, "optim/muon_lr": 0.0004, "optim/adamw_lr": 1.1999999999999999e-05, "perf/tokens_per_sec": 1023310.5040953859, "perf/iters_per_sec": 0.4879524727322511, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 2.0493799209594727, "data/tokens_consumed": 23068672, "data/tokens_consumed_B": 0.023068672} {"step": 20, "timestamp": 1778325745.7074263, "train/loss": 2.661491537094116, "train/z_loss": 0.0012089664349332451, "train/perplexity": 14.31762845444072, "train/grad_norm": 1.28125, "optim/muon_lr": 0.0008, "optim/adamw_lr": 2.3999999999999997e-05, "perf/tokens_per_sec": 2023623.7450106447, "perf/iters_per_sec": 0.964938995843241, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.0363349437713623, "data/tokens_consumed": 44040192, "data/tokens_consumed_B": 0.044040192} {"step": 30, "timestamp": 1778325756.5791347, "train/loss": 2.6678016424179076, "train/z_loss": 0.001208654942456633, "train/perplexity": 14.40825984405667, "train/grad_norm": 1.0546875, "optim/muon_lr": 0.0012, "optim/adamw_lr": 3.5999999999999994e-05, "perf/tokens_per_sec": 1930325.3031628756, "perf/iters_per_sec": 0.9204508319677713, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.086424136161804, "data/tokens_consumed": 65011712, "data/tokens_consumed_B": 0.065011712} {"step": 40, "timestamp": 1778325766.9484584, "train/loss": 2.6290159463882445, "train/z_loss": 0.0012269214610569179, "train/perplexity": 13.860124083958553, "train/grad_norm": 0.6015625, "optim/muon_lr": 0.0016, "optim/adamw_lr": 4.7999999999999994e-05, "perf/tokens_per_sec": 2023643.6243460528, "perf/iters_per_sec": 0.964948475049044, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.0363247632980346, "data/tokens_consumed": 85983232, "data/tokens_consumed_B": 0.085983232} {"step": 50, "timestamp": 1778325777.3160207, "grad/layer_0/attn": 0.005251817870885134, "grad/layer_0/mlp": 0.005103958770632744, "grad/layer_0/attn_mlp_ratio": 1.0289694733049564, "grad/layer_4/attn": 0.003493203781545162, "grad/layer_4/mlp": 0.004177029710263014, "grad/layer_4/attn_mlp_ratio": 0.8362889278315222, "grad/layer_8/attn": 0.00614055385813117, "grad/layer_8/mlp": 0.004842112306505442, "grad/layer_8/attn_mlp_ratio": 1.2681559911499072, "grad/layer_12/attn": 0.007308659143745899, "grad/layer_12/mlp": 0.010062147863209248, "grad/layer_12/attn_mlp_ratio": 0.7263517859674622, "grad/layer_16/attn": 0.005596315488219261, "grad/layer_16/mlp": 0.005321214906871319, "grad/layer_16/attn_mlp_ratio": 1.0516988095750133, "grad/layer_20/attn": 0.004985390231013298, "grad/layer_20/mlp": 0.007716618478298187, "grad/layer_20/attn_mlp_ratio": 0.6460589156284008, "grad/layer_24/attn": 0.015764154493808746, "grad/layer_24/mlp": 0.014783293008804321, "grad/layer_24/attn_mlp_ratio": 1.06634931593288, "grad/layer_27/attn": 0.009734280407428741, "grad/layer_27/mlp": 0.013607571832835674, "grad/layer_27/attn_mlp_ratio": 0.7153576299633215} {"step": 50, "timestamp": 1778325777.3326008, "train/loss": 2.6275009393692015, "train/z_loss": 0.0012247079983353614, "train/perplexity": 13.839141796855959, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.002, "optim/adamw_lr": 5.9999999999999995e-05, "perf/tokens_per_sec": 2020752.05462387, "perf/iters_per_sec": 0.9635696671599722, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.0378076791763307, "data/tokens_consumed": 106954752, "data/tokens_consumed_B": 0.106954752} {"step": 60, "timestamp": 1778325787.6985753, "train/loss": 2.6499736309051514, "train/z_loss": 0.0012271335697732865, "train/perplexity": 14.153665421109086, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.0024, "optim/adamw_lr": 7.199999999999999e-05, "perf/tokens_per_sec": 2024360.1928847805, "perf/iters_per_sec": 0.9652901615547087, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.035957932472229, "data/tokens_consumed": 127926272, "data/tokens_consumed_B": 0.127926272} {"step": 70, "timestamp": 1778325798.0708613, "train/loss": 2.6093976736068725, "train/z_loss": 0.0012324419687502087, "train/perplexity": 13.590862250934173, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.0028000000000000004, "optim/adamw_lr": 8.4e-05, "perf/tokens_per_sec": 2023156.4371018677, "perf/iters_per_sec": 0.964716166068014, "perf/gpu_mem_gb": 77.834036736, "perf/step_time_s": 1.0365743160247802, "data/tokens_consumed": 148897792, "data/tokens_consumed_B": 0.148897792} {"step": 75, "timestamp": 1778325803.8706377, "eos/sharpness": 70.4648733139038, "eos/L0_probe": 2.617017984390259, "eos/L_plus": 2.9164748191833496, "eos/L_minus": 3.022209882736206, "eos/grad_norm": 0.22952648997306824, "eos/embed_grad_frac": 0.0721876472234726, "eos/time_s": 0.6282720565795898} {"step": 75, "timestamp": 1778325805.2549243, "geo/rankme_last": 419.4337463378906, "geo/layer_0/stable_rank_q_proj": 19.194869995117188, "geo/layer_0/stable_rank_k_proj": 15.870049476623535, "geo/layer_0/stable_rank_o_proj": 46.776397705078125, "geo/layer_0/stable_rank_gate_proj": 129.22052001953125, "geo/layer_0/stable_rank_down_proj": 56.17716979980469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0643659308552742, "geo/layer_0/attn_entropy_mean": 6.176344394683838, "geo/layer_0/attn_entropy_std": 0.4060834050178528, "geo/layer_7/stable_rank_q_proj": 42.9242057800293, "geo/layer_7/stable_rank_k_proj": 40.21754455566406, "geo/layer_7/stable_rank_o_proj": 89.41863250732422, "geo/layer_7/stable_rank_gate_proj": 78.6012191772461, "geo/layer_7/stable_rank_down_proj": 139.80958557128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44074955582618713, "geo/layer_7/attn_entropy_mean": 4.767570495605469, "geo/layer_7/attn_entropy_std": 0.7620633840560913, "geo/layer_14/stable_rank_q_proj": 50.329288482666016, "geo/layer_14/stable_rank_k_proj": 41.06708526611328, "geo/layer_14/stable_rank_o_proj": 43.4610595703125, "geo/layer_14/stable_rank_gate_proj": 70.91854095458984, "geo/layer_14/stable_rank_down_proj": 126.72139739990234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3891686201095581, "geo/layer_14/attn_entropy_mean": 5.631989479064941, "geo/layer_14/attn_entropy_std": 0.3773915469646454, "geo/layer_21/stable_rank_q_proj": 39.941680908203125, "geo/layer_21/stable_rank_k_proj": 30.137243270874023, "geo/layer_21/stable_rank_o_proj": 68.94126892089844, "geo/layer_21/stable_rank_gate_proj": 64.38775634765625, "geo/layer_21/stable_rank_down_proj": 49.864139556884766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15084579586982727, "geo/layer_21/attn_entropy_mean": 5.933197975158691, "geo/layer_21/attn_entropy_std": 0.2984740138053894, "geo/layer_27/stable_rank_q_proj": 43.5896110534668, "geo/layer_27/stable_rank_k_proj": 32.1873779296875, "geo/layer_27/stable_rank_o_proj": 115.18305206298828, "geo/layer_27/stable_rank_gate_proj": 78.02488708496094, "geo/layer_27/stable_rank_down_proj": 127.8525161743164, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1240856945514679, "geo/layer_27/attn_entropy_mean": 4.355798721313477, "geo/layer_27/attn_entropy_std": 0.6234311461448669, "attnres/final_alpha/block_0": 0.23601442575454712, "attnres/block_norm/0": 1.7861734628677368, "attnres/final_alpha/block_1": 0.0039263516664505005, "attnres/block_norm/1": 47458.44140625, "attnres/final_alpha/block_2": 0.009324129670858383, "attnres/block_norm/2": 28887.923828125, "attnres/final_alpha/block_3": 0.0106833316385746, "attnres/block_norm/3": 63347.8828125, "attnres/final_alpha/block_4": 0.012332159094512463, "attnres/block_norm/4": 15737.4609375, "attnres/final_alpha/block_5": 0.6284816265106201, "attnres/block_norm/5": 6433.1552734375, "attnres/final_alpha/block_6": 0.09923794865608215, "attnres/block_norm/6": 42430.1796875, "geo/tier1_time_s": 1.3636574745178223, "geo/step": 75.0} {"step": 80, "timestamp": 1778325810.4376981, "train/loss": 2.68451189994812, "train/z_loss": 0.0012168362969532608, "train/perplexity": 14.651048457221595, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.0032, "optim/adamw_lr": 9.599999999999999e-05, "perf/tokens_per_sec": 1696848.7564325146, "perf/iters_per_sec": 0.8091205389177869, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.2359097957611085, "data/tokens_consumed": 169869312, "data/tokens_consumed_B": 0.169869312} {"step": 90, "timestamp": 1778325820.8018854, "train/loss": 2.642735147476196, "train/z_loss": 0.001228572044055909, "train/perplexity": 14.051584250646798, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.0036, "optim/adamw_lr": 0.00010799999999999998, "perf/tokens_per_sec": 2024741.9240291072, "perf/iters_per_sec": 0.9654721851487671, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035762619972229, "data/tokens_consumed": 190840832, "data/tokens_consumed_B": 0.190840832, "train/loss_slope": -0.00041227183197483947} {"step": 100, "timestamp": 1778325831.1475384, "grad/layer_0/attn": 0.0047237081453204155, "grad/layer_0/mlp": 0.004799042362719774, "grad/layer_0/attn_mlp_ratio": 0.9843022190396157, "grad/layer_4/attn": 0.0027522731106728315, "grad/layer_4/mlp": 0.003724041860550642, "grad/layer_4/attn_mlp_ratio": 0.7390553435831525, "grad/layer_8/attn": 0.004398517310619354, "grad/layer_8/mlp": 0.004359944257885218, "grad/layer_8/attn_mlp_ratio": 1.0088471204143639, "grad/layer_12/attn": 0.0054624043405056, "grad/layer_12/mlp": 0.0077866376377642155, "grad/layer_12/attn_mlp_ratio": 0.7015100129820635, "grad/layer_16/attn": 0.004108484368771315, "grad/layer_16/mlp": 0.004487740341573954, "grad/layer_16/attn_mlp_ratio": 0.9154906399467201, "grad/layer_20/attn": 0.003776676021516323, "grad/layer_20/mlp": 0.005687240976840258, "grad/layer_20/attn_mlp_ratio": 0.6640611801908326, "grad/layer_24/attn": 0.005468001589179039, "grad/layer_24/mlp": 0.008232107385993004, "grad/layer_24/attn_mlp_ratio": 0.664228643574308, "grad/layer_27/attn": 0.005026939790695906, "grad/layer_27/mlp": 0.007965147495269775, "grad/layer_27/attn_mlp_ratio": 0.6311169668320058} {"step": 100, "timestamp": 1778325831.1633894, "train/loss": 2.6361597776412964, "train/z_loss": 0.0012178237433545292, "train/perplexity": 13.959492986294043, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.004, "optim/adamw_lr": 0.00011999999999999999, "perf/tokens_per_sec": 2025164.5963456342, "perf/iters_per_sec": 0.965673731015031, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0355464458465575, "data/tokens_consumed": 211812352, "data/tokens_consumed_B": 0.211812352, "train/loss_slope": -0.0003851837678389119} {"step": 110, "timestamp": 1778325841.5302064, "train/loss": 2.6177937746047975, "train/z_loss": 0.0012331146630458535, "train/perplexity": 13.705452887152665, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.0044, "optim/adamw_lr": 0.00013199999999999998, "perf/tokens_per_sec": 2024385.4911662836, "perf/iters_per_sec": 0.9653022247153681, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0359449863433838, "data/tokens_consumed": 232783872, "data/tokens_consumed_B": 0.232783872, "train/loss_slope": -0.00042537975144552986} {"step": 120, "timestamp": 1778325851.9104793, "train/loss": 2.6106740951538088, "train/z_loss": 0.001236984960269183, "train/perplexity": 13.60822099653435, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.0048, "optim/adamw_lr": 0.00014399999999999998, "perf/tokens_per_sec": 2022136.8358371456, "perf/iters_per_sec": 0.9642299822984436, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0370969772338867, "data/tokens_consumed": 253755392, "data/tokens_consumed_B": 0.253755392, "train/loss_slope": -0.00045912198967985955} {"step": 130, "timestamp": 1778325862.2748291, "train/loss": 2.5801828384399412, "train/z_loss": 0.0012435590964742006, "train/perplexity": 13.199551324414076, "train/grad_norm": 0.09375, "optim/muon_lr": 0.005200000000000001, "optim/adamw_lr": 0.000156, "perf/tokens_per_sec": 2024813.8875276581, "perf/iters_per_sec": 0.9655065000189105, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0357258081436158, "data/tokens_consumed": 274726912, "data/tokens_consumed_B": 0.274726912, "train/loss_slope": -0.0005543315017616353} {"step": 140, "timestamp": 1778325872.630965, "train/loss": 2.6213156938552857, "train/z_loss": 0.0012352317222394048, "train/perplexity": 13.753807486026716, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.005600000000000001, "optim/adamw_lr": 0.000168, "perf/tokens_per_sec": 2026299.1829530238, "perf/iters_per_sec": 0.966214744068634, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349666118621825, "data/tokens_consumed": 295698432, "data/tokens_consumed_B": 0.295698432, "train/loss_slope": -0.0004995272585323874} {"step": 150, "timestamp": 1778325882.9888978, "grad/layer_0/attn": 0.0037923965137451887, "grad/layer_0/mlp": 0.0033957611303776503, "grad/layer_0/attn_mlp_ratio": 1.1168030542958487, "grad/layer_4/attn": 0.002209186088293791, "grad/layer_4/mlp": 0.0028093375731259584, "grad/layer_4/attn_mlp_ratio": 0.7863725708115474, "grad/layer_8/attn": 0.0033580991439521313, "grad/layer_8/mlp": 0.0035276475828140974, "grad/layer_8/attn_mlp_ratio": 0.9519372244320292, "grad/layer_12/attn": 0.005488509312272072, "grad/layer_12/mlp": 0.0070423255674541, "grad/layer_12/attn_mlp_ratio": 0.779360337968614, "grad/layer_16/attn": 0.003624686971306801, "grad/layer_16/mlp": 0.004447749350219965, "grad/layer_16/attn_mlp_ratio": 0.8149485513684996, "grad/layer_20/attn": 0.005331417080014944, "grad/layer_20/mlp": 0.005368197336792946, "grad/layer_20/attn_mlp_ratio": 0.9931484716776784, "grad/layer_24/attn": 0.008703801780939102, "grad/layer_24/mlp": 0.007654668763279915, "grad/layer_24/attn_mlp_ratio": 1.1370579101980447, "grad/layer_27/attn": 0.004735023248940706, "grad/layer_27/mlp": 0.006882656831294298, "grad/layer_27/attn_mlp_ratio": 0.6879644439941996} {"step": 150, "timestamp": 1778325883.5952344, "eos/sharpness": 19.17295455932617, "eos/L0_probe": 2.5726475715637207, "eos/L_plus": 2.6631436347961426, "eos/L_minus": 2.6738810539245605, "eos/grad_norm": 0.09578008949756622, "eos/embed_grad_frac": 0.27402156591415405, "eos/time_s": 0.6033017635345459} {"step": 150, "timestamp": 1778325883.6153, "train/loss": 2.6009267807006835, "train/z_loss": 0.0012391583062708379, "train/perplexity": 13.47622175151181, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.006, "optim/adamw_lr": 0.00017999999999999998, "perf/tokens_per_sec": 1910461.346821042, "perf/iters_per_sec": 0.9109789594750605, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.097720193862915, "data/tokens_consumed": 316669952, "data/tokens_consumed_B": 0.316669952, "train/loss_slope": -0.000496813332333284} {"step": 150, "timestamp": 1778325884.9855695, "geo/rankme_last": 419.4991760253906, "geo/layer_0/stable_rank_q_proj": 19.209312438964844, "geo/layer_0/stable_rank_k_proj": 15.884905815124512, "geo/layer_0/stable_rank_o_proj": 46.78336715698242, "geo/layer_0/stable_rank_gate_proj": 129.22964477539062, "geo/layer_0/stable_rank_down_proj": 56.13589859008789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06683249771595001, "geo/layer_0/attn_entropy_mean": 6.179607391357422, "geo/layer_0/attn_entropy_std": 0.40419942140579224, "geo/layer_7/stable_rank_q_proj": 42.90888977050781, "geo/layer_7/stable_rank_k_proj": 40.21040344238281, "geo/layer_7/stable_rank_o_proj": 89.42527770996094, "geo/layer_7/stable_rank_gate_proj": 78.59483337402344, "geo/layer_7/stable_rank_down_proj": 139.77024841308594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4416158199310303, "geo/layer_7/attn_entropy_mean": 4.765964508056641, "geo/layer_7/attn_entropy_std": 0.7667602896690369, "geo/layer_14/stable_rank_q_proj": 50.33415985107422, "geo/layer_14/stable_rank_k_proj": 41.0533447265625, "geo/layer_14/stable_rank_o_proj": 43.44756317138672, "geo/layer_14/stable_rank_gate_proj": 70.9219970703125, "geo/layer_14/stable_rank_down_proj": 126.66934967041016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40459245443344116, "geo/layer_14/attn_entropy_mean": 5.625161170959473, "geo/layer_14/attn_entropy_std": 0.38973984122276306, "geo/layer_21/stable_rank_q_proj": 39.94717025756836, "geo/layer_21/stable_rank_k_proj": 30.147783279418945, "geo/layer_21/stable_rank_o_proj": 68.94711303710938, "geo/layer_21/stable_rank_gate_proj": 64.38642120361328, "geo/layer_21/stable_rank_down_proj": 49.853904724121094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15083365142345428, "geo/layer_21/attn_entropy_mean": 5.929939270019531, "geo/layer_21/attn_entropy_std": 0.30286040902137756, "geo/layer_27/stable_rank_q_proj": 43.59410858154297, "geo/layer_27/stable_rank_k_proj": 32.1889533996582, "geo/layer_27/stable_rank_o_proj": 115.2239990234375, "geo/layer_27/stable_rank_gate_proj": 78.01119232177734, "geo/layer_27/stable_rank_down_proj": 127.81905364990234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.12125550210475922, "geo/layer_27/attn_entropy_mean": 4.354985237121582, "geo/layer_27/attn_entropy_std": 0.6015642881393433, "attnres/final_alpha/block_0": 0.2388896942138672, "attnres/block_norm/0": 1.7863727807998657, "attnres/final_alpha/block_1": 0.003973871003836393, "attnres/block_norm/1": 47663.4375, "attnres/final_alpha/block_2": 0.009344879537820816, "attnres/block_norm/2": 28983.87890625, "attnres/final_alpha/block_3": 0.0107130017131567, "attnres/block_norm/3": 64178.953125, "attnres/final_alpha/block_4": 0.012566652148962021, "attnres/block_norm/4": 15767.6103515625, "attnres/final_alpha/block_5": 0.6232642531394958, "attnres/block_norm/5": 6519.203125, "attnres/final_alpha/block_6": 0.10124765336513519, "attnres/block_norm/6": 42194.390625, "geo/tier1_time_s": 1.365804672241211, "geo/step": 150.0} {"step": 160, "timestamp": 1778325896.0247965, "train/loss": 2.5509734392166137, "train/z_loss": 0.0012493264744989573, "train/perplexity": 12.819576789608437, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.0064, "optim/adamw_lr": 0.00019199999999999998, "perf/tokens_per_sec": 1690588.7159639762, "perf/iters_per_sec": 0.8061355190105325, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.240486216545105, "data/tokens_consumed": 337641472, "data/tokens_consumed_B": 0.337641472, "train/loss_slope": -0.0005831571244725993} {"step": 170, "timestamp": 1778325906.3846755, "train/loss": 2.5426226615905763, "train/z_loss": 0.001255085866432637, "train/perplexity": 12.712969102649543, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.0068000000000000005, "optim/adamw_lr": 0.000204, "perf/tokens_per_sec": 2025889.8947875022, "perf/iters_per_sec": 0.9660195802629005, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351757049560546, "data/tokens_consumed": 358612992, "data/tokens_consumed_B": 0.358612992, "train/loss_slope": -0.0006481689196252968} {"step": 180, "timestamp": 1778325916.743184, "train/loss": 2.5820547103881837, "train/z_loss": 0.0012449844158254563, "train/perplexity": 13.224282333687631, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.0072, "optim/adamw_lr": 0.00021599999999999996, "perf/tokens_per_sec": 2025840.9499536671, "perf/iters_per_sec": 0.9659962415474258, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0352007150650024, "data/tokens_consumed": 379584512, "data/tokens_consumed_B": 0.379584512, "train/loss_slope": -0.0006222083024811321} {"step": 190, "timestamp": 1778325927.1061718, "train/loss": 2.6069161176681517, "train/z_loss": 0.001256444479804486, "train/perplexity": 13.557177578488693, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.0076, "optim/adamw_lr": 0.00022799999999999999, "perf/tokens_per_sec": 2025279.6765628485, "perf/iters_per_sec": 0.9657286055387728, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0354876041412353, "data/tokens_consumed": 400556032, "data/tokens_consumed_B": 0.400556032, "train/loss_slope": -0.0005588891524121275} {"step": 200, "timestamp": 1778325937.4499087, "grad/layer_0/attn": 0.00319800921715796, "grad/layer_0/mlp": 0.0032348341774195433, "grad/layer_0/attn_mlp_ratio": 0.9886160906236721, "grad/layer_4/attn": 0.0033140676096081734, "grad/layer_4/mlp": 0.002800198970362544, "grad/layer_4/attn_mlp_ratio": 1.183511431270884, "grad/layer_8/attn": 0.003728110110387206, "grad/layer_8/mlp": 0.0037324719596654177, "grad/layer_8/attn_mlp_ratio": 0.9988313511237369, "grad/layer_12/attn": 0.005155641119927168, "grad/layer_12/mlp": 0.007104174233973026, "grad/layer_12/attn_mlp_ratio": 0.7257199609069649, "grad/layer_16/attn": 0.0039649661630392075, "grad/layer_16/mlp": 0.004533651750534773, "grad/layer_16/attn_mlp_ratio": 0.8745634410748843, "grad/layer_20/attn": 0.00353236380033195, "grad/layer_20/mlp": 0.005474377889186144, "grad/layer_20/attn_mlp_ratio": 0.645253909633137, "grad/layer_24/attn": 0.007332888897508383, "grad/layer_24/mlp": 0.00758332759141922, "grad/layer_24/attn_mlp_ratio": 0.9669750795295031, "grad/layer_27/attn": 0.003979876637458801, "grad/layer_27/mlp": 0.006352261174470186, "grad/layer_27/attn_mlp_ratio": 0.6265291154590843} {"step": 200, "timestamp": 1778325937.4647741, "train/loss": 2.5519673585891725, "train/z_loss": 0.0012752150883898138, "train/perplexity": 12.832324749500332, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.008, "optim/adamw_lr": 0.00023999999999999998, "perf/tokens_per_sec": 2025868.1982633315, "perf/iters_per_sec": 0.966009234553972, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351867914199828, "data/tokens_consumed": 421527552, "data/tokens_consumed_B": 0.421527552, "train/loss_slope": -0.0005761202589257969} {"step": 210, "timestamp": 1778325947.8260443, "train/loss": 2.5923482179641724, "train/z_loss": 0.0012535246671177447, "train/perplexity": 13.361109592146883, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.0084, "optim/adamw_lr": 0.00025199999999999995, "perf/tokens_per_sec": 2025380.778890211, "perf/iters_per_sec": 0.96577681488524, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0354359149932861, "data/tokens_consumed": 442499072, "data/tokens_consumed_B": 0.442499072, "train/loss_slope": -0.0005343467081286418} {"step": 220, "timestamp": 1778325958.841748, "train/loss": 2.5551630020141602, "train/z_loss": 0.0012778038624674081, "train/perplexity": 12.873397876293478, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.0088, "optim/adamw_lr": 0.00026399999999999997, "perf/tokens_per_sec": 1905143.6954397936, "perf/iters_per_sec": 0.9084433057021111, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1007841587066651, "data/tokens_consumed": 463470592, "data/tokens_consumed_B": 0.463470592, "train/loss_slope": -0.0005371730812924649} {"step": 225, "timestamp": 1778325964.6056457, "eos/sharpness": 16.704940795898434, "eos/L0_probe": 2.535898447036743, "eos/L_plus": 2.6403045654296875, "eos/L_minus": 2.598541736602783, "eos/grad_norm": 0.09259609878063202, "eos/embed_grad_frac": 0.3008134663105011, "eos/time_s": 0.593252420425415} {"step": 225, "timestamp": 1778325965.9914606, "geo/rankme_last": 419.0769348144531, "geo/layer_0/stable_rank_q_proj": 19.238765716552734, "geo/layer_0/stable_rank_k_proj": 15.9253511428833, "geo/layer_0/stable_rank_o_proj": 46.77499008178711, "geo/layer_0/stable_rank_gate_proj": 129.15582275390625, "geo/layer_0/stable_rank_down_proj": 56.024112701416016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06322398036718369, "geo/layer_0/attn_entropy_mean": 6.183521270751953, "geo/layer_0/attn_entropy_std": 0.4008590877056122, "geo/layer_7/stable_rank_q_proj": 42.897705078125, "geo/layer_7/stable_rank_k_proj": 40.22687530517578, "geo/layer_7/stable_rank_o_proj": 89.43285369873047, "geo/layer_7/stable_rank_gate_proj": 78.59336853027344, "geo/layer_7/stable_rank_down_proj": 139.78765869140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4492989778518677, "geo/layer_7/attn_entropy_mean": 4.763826847076416, "geo/layer_7/attn_entropy_std": 0.7815418243408203, "geo/layer_14/stable_rank_q_proj": 50.32100296020508, "geo/layer_14/stable_rank_k_proj": 41.05976486206055, "geo/layer_14/stable_rank_o_proj": 43.45140838623047, "geo/layer_14/stable_rank_gate_proj": 70.96711730957031, "geo/layer_14/stable_rank_down_proj": 126.67364501953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4164154529571533, "geo/layer_14/attn_entropy_mean": 5.578359603881836, "geo/layer_14/attn_entropy_std": 0.4040682315826416, "geo/layer_21/stable_rank_q_proj": 39.956207275390625, "geo/layer_21/stable_rank_k_proj": 30.143041610717773, "geo/layer_21/stable_rank_o_proj": 68.92224884033203, "geo/layer_21/stable_rank_gate_proj": 64.3749008178711, "geo/layer_21/stable_rank_down_proj": 49.83890151977539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1563320904970169, "geo/layer_21/attn_entropy_mean": 5.909801006317139, "geo/layer_21/attn_entropy_std": 0.2992064952850342, "geo/layer_27/stable_rank_q_proj": 43.59917449951172, "geo/layer_27/stable_rank_k_proj": 32.177406311035156, "geo/layer_27/stable_rank_o_proj": 115.26121520996094, "geo/layer_27/stable_rank_gate_proj": 77.96100616455078, "geo/layer_27/stable_rank_down_proj": 127.83527374267578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.12671928107738495, "geo/layer_27/attn_entropy_mean": 4.360410690307617, "geo/layer_27/attn_entropy_std": 0.6127138733863831, "attnres/final_alpha/block_0": 0.24104811251163483, "attnres/block_norm/0": 1.7865641117095947, "attnres/final_alpha/block_1": 0.003996346611529589, "attnres/block_norm/1": 47934.22265625, "attnres/final_alpha/block_2": 0.00917171873152256, "attnres/block_norm/2": 29151.2421875, "attnres/final_alpha/block_3": 0.01054021529853344, "attnres/block_norm/3": 64982.46875, "attnres/final_alpha/block_4": 0.012557298876345158, "attnres/block_norm/4": 15768.591796875, "attnres/final_alpha/block_5": 0.6211010217666626, "attnres/block_norm/5": 6563.17578125, "attnres/final_alpha/block_6": 0.101585254073143, "attnres/block_norm/6": 42321.82421875, "geo/tier1_time_s": 1.366903305053711, "geo/step": 225.0} {"step": 230, "timestamp": 1778325971.172426, "train/loss": 2.5247085094451904, "train/z_loss": 0.0012886462965980173, "train/perplexity": 12.487254816220142, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.0092, "optim/adamw_lr": 0.000276, "perf/tokens_per_sec": 1701745.8811468051, "perf/iters_per_sec": 0.8114556699499155, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.2323532104492188, "data/tokens_consumed": 484442112, "data/tokens_consumed_B": 0.484442112, "train/loss_slope": -0.0005644321503846538} {"step": 240, "timestamp": 1778325981.5358195, "train/loss": 2.524054193496704, "train/z_loss": 0.0012869407655671239, "train/perplexity": 12.479086878738366, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.0096, "optim/adamw_lr": 0.00028799999999999995, "perf/tokens_per_sec": 2024665.4917181677, "perf/iters_per_sec": 0.9654357393828238, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0358017206192016, "data/tokens_consumed": 505413632, "data/tokens_consumed_B": 0.505413632, "train/loss_slope": -0.00058104612827301} {"step": 250, "timestamp": 1778325991.8900619, "grad/layer_0/attn": 0.0030650619883090258, "grad/layer_0/mlp": 0.003287768689915538, "grad/layer_0/attn_mlp_ratio": 0.9322620245408944, "grad/layer_4/attn": 0.0022782781161367893, "grad/layer_4/mlp": 0.002853329526260495, "grad/layer_4/attn_mlp_ratio": 0.7984629939593224, "grad/layer_8/attn": 0.004292098339647055, "grad/layer_8/mlp": 0.0036666777450591326, "grad/layer_8/attn_mlp_ratio": 1.1705687057919962, "grad/layer_12/attn": 0.007974037900567055, "grad/layer_12/mlp": 0.007517857477068901, "grad/layer_12/attn_mlp_ratio": 1.060679564466558, "grad/layer_16/attn": 0.004452802240848541, "grad/layer_16/mlp": 0.004656312521547079, "grad/layer_16/attn_mlp_ratio": 0.9562936604048462, "grad/layer_20/attn": 0.005177476443350315, "grad/layer_20/mlp": 0.004938199184834957, "grad/layer_20/attn_mlp_ratio": 1.0484543342044073, "grad/layer_24/attn": 0.005329506937414408, "grad/layer_24/mlp": 0.007187082432210445, "grad/layer_24/attn_mlp_ratio": 0.7415396878398272, "grad/layer_27/attn": 0.004095318727195263, "grad/layer_27/mlp": 0.0059900772757828236, "grad/layer_27/attn_mlp_ratio": 0.6836837774002976} {"step": 250, "timestamp": 1778325991.9048462, "train/loss": 2.518563461303711, "train/z_loss": 0.0012900295085273683, "train/perplexity": 12.41075532148708, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.01, "optim/adamw_lr": 0.0003, "perf/tokens_per_sec": 2023892.4051198873, "perf/iters_per_sec": 0.9650671029662549, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0361973762512207, "data/tokens_consumed": 526385152, "data/tokens_consumed_B": 0.526385152, "train/loss_slope": -0.0005938368381598053} {"step": 260, "timestamp": 1778326002.25933, "train/loss": 2.494672417640686, "train/z_loss": 0.0013011571136303246, "train/perplexity": 12.117763302846273, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.010400000000000001, "optim/adamw_lr": 0.000312, "perf/tokens_per_sec": 2026560.243015088, "perf/iters_per_sec": 0.9663392272067489, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0348332881927491, "data/tokens_consumed": 547356672, "data/tokens_consumed_B": 0.547356672, "train/loss_slope": -0.000618236350198077} {"step": 270, "timestamp": 1778326012.6153767, "train/loss": 2.4915273189544678, "train/z_loss": 0.0013114389497786761, "train/perplexity": 12.07971161093186, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.0108, "optim/adamw_lr": 0.000324, "perf/tokens_per_sec": 2026190.6144710495, "perf/iters_per_sec": 0.9661629745822189, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0350220680236817, "data/tokens_consumed": 568328192, "data/tokens_consumed_B": 0.568328192, "train/loss_slope": -0.0006355236949461301} {"step": 280, "timestamp": 1778326022.9871526, "train/loss": 2.521669125556946, "train/z_loss": 0.0013235958176665008, "train/perplexity": 12.44935887445383, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.011200000000000002, "optim/adamw_lr": 0.000336, "perf/tokens_per_sec": 2023629.2385584256, "perf/iters_per_sec": 0.9649416153709534, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.036332130432129, "data/tokens_consumed": 589299712, "data/tokens_consumed_B": 0.589299712, "train/loss_slope": -0.000624302418948394} {"step": 290, "timestamp": 1778326033.3419547, "train/loss": 2.5315951108932495, "train/z_loss": 0.0013065413688309491, "train/perplexity": 12.573546349987431, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.0116, "optim/adamw_lr": 0.00034799999999999995, "perf/tokens_per_sec": 2026427.8371954313, "perf/iters_per_sec": 0.9662760911919743, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349009037017822, "data/tokens_consumed": 610271232, "data/tokens_consumed_B": 0.610271232, "train/loss_slope": -0.0006047489884432747} {"step": 300, "timestamp": 1778326043.6886208, "grad/layer_0/attn": 0.0029793265275657177, "grad/layer_0/mlp": 0.0031357507687062025, "grad/layer_0/attn_mlp_ratio": 0.9501158262598123, "grad/layer_4/attn": 0.00334154162555933, "grad/layer_4/mlp": 0.002838998334482312, "grad/layer_4/attn_mlp_ratio": 1.1770142543839253, "grad/layer_8/attn": 0.005391314160078764, "grad/layer_8/mlp": 0.003831452690064907, "grad/layer_8/attn_mlp_ratio": 1.407120080941264, "grad/layer_12/attn": 0.0052128867246210575, "grad/layer_12/mlp": 0.007554628420621157, "grad/layer_12/attn_mlp_ratio": 0.6900255532607503, "grad/layer_16/attn": 0.0038710825610905886, "grad/layer_16/mlp": 0.004462788347154856, "grad/layer_16/attn_mlp_ratio": 0.8674134135931326, "grad/layer_20/attn": 0.0032421902287751436, "grad/layer_20/mlp": 0.006207099184393883, "grad/layer_20/attn_mlp_ratio": 0.5223358094056555, "grad/layer_24/attn": 0.012916596606373787, "grad/layer_24/mlp": 0.009811921045184135, "grad/layer_24/attn_mlp_ratio": 1.3164187130380152, "grad/layer_27/attn": 0.008568797260522842, "grad/layer_27/mlp": 0.00832746084779501, "grad/layer_27/attn_mlp_ratio": 1.0289807798848623} {"step": 300, "timestamp": 1778326044.300374, "eos/sharpness": 58.489871025085435, "eos/L0_probe": 2.508732318878174, "eos/L_plus": 2.757418155670166, "eos/L_minus": 2.844945192337036, "eos/grad_norm": 0.14983607828617096, "eos/embed_grad_frac": 0.11495473980903625, "eos/time_s": 0.6088368892669678} {"step": 300, "timestamp": 1778326044.3218284, "train/loss": 2.482262372970581, "train/z_loss": 0.0013232023105956615, "train/perplexity": 11.968310594436101, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.012, "optim/adamw_lr": 0.00035999999999999997, "perf/tokens_per_sec": 1910881.0276694645, "perf/iters_per_sec": 0.9111790788981745, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0974791049957275, "data/tokens_consumed": 631242752, "data/tokens_consumed_B": 0.631242752, "train/loss_slope": -0.0006149241530126138} {"step": 300, "timestamp": 1778326045.6892183, "geo/rankme_last": 418.9786071777344, "geo/layer_0/stable_rank_q_proj": 19.271690368652344, "geo/layer_0/stable_rank_k_proj": 15.996881484985352, "geo/layer_0/stable_rank_o_proj": 46.73983383178711, "geo/layer_0/stable_rank_gate_proj": 129.0180206298828, "geo/layer_0/stable_rank_down_proj": 55.93442153930664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06495006382465363, "geo/layer_0/attn_entropy_mean": 6.19115686416626, "geo/layer_0/attn_entropy_std": 0.3977494239807129, "geo/layer_7/stable_rank_q_proj": 42.886295318603516, "geo/layer_7/stable_rank_k_proj": 40.231605529785156, "geo/layer_7/stable_rank_o_proj": 89.461181640625, "geo/layer_7/stable_rank_gate_proj": 78.61244201660156, "geo/layer_7/stable_rank_down_proj": 140.0493621826172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4383857250213623, "geo/layer_7/attn_entropy_mean": 4.744980812072754, "geo/layer_7/attn_entropy_std": 0.772651195526123, "geo/layer_14/stable_rank_q_proj": 50.30234146118164, "geo/layer_14/stable_rank_k_proj": 41.07728958129883, "geo/layer_14/stable_rank_o_proj": 43.43040466308594, "geo/layer_14/stable_rank_gate_proj": 71.02947998046875, "geo/layer_14/stable_rank_down_proj": 126.78021240234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4078315496444702, "geo/layer_14/attn_entropy_mean": 5.57923698425293, "geo/layer_14/attn_entropy_std": 0.39664819836616516, "geo/layer_21/stable_rank_q_proj": 39.99577713012695, "geo/layer_21/stable_rank_k_proj": 30.14080810546875, "geo/layer_21/stable_rank_o_proj": 68.86211395263672, "geo/layer_21/stable_rank_gate_proj": 64.32102966308594, "geo/layer_21/stable_rank_down_proj": 49.846229553222656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15174080431461334, "geo/layer_21/attn_entropy_mean": 5.916573524475098, "geo/layer_21/attn_entropy_std": 0.3124845027923584, "geo/layer_27/stable_rank_q_proj": 43.572303771972656, "geo/layer_27/stable_rank_k_proj": 32.149105072021484, "geo/layer_27/stable_rank_o_proj": 115.21073150634766, "geo/layer_27/stable_rank_gate_proj": 77.87166595458984, "geo/layer_27/stable_rank_down_proj": 127.8044204711914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.12393442541360855, "geo/layer_27/attn_entropy_mean": 4.363925933837891, "geo/layer_27/attn_entropy_std": 0.6445966958999634, "attnres/final_alpha/block_0": 0.24422504007816315, "attnres/block_norm/0": 1.7868996858596802, "attnres/final_alpha/block_1": 0.0039992500096559525, "attnres/block_norm/1": 48110.703125, "attnres/final_alpha/block_2": 0.00915153045207262, "attnres/block_norm/2": 29109.8984375, "attnres/final_alpha/block_3": 0.010709408670663834, "attnres/block_norm/3": 65032.2265625, "attnres/final_alpha/block_4": 0.012625868432223797, "attnres/block_norm/4": 15850.4072265625, "attnres/final_alpha/block_5": 0.6161125302314758, "attnres/block_norm/5": 6621.435546875, "attnres/final_alpha/block_6": 0.10317639261484146, "attnres/block_norm/6": 42811.9375, "geo/tier1_time_s": 1.3632557392120361, "geo/step": 300.0, "geo/rankme_slope": 0.011155680338541666} {"step": 310, "timestamp": 1778326056.0476449, "train/loss": 2.484754753112793, "train/z_loss": 0.0013229163130745291, "train/perplexity": 11.998177378326105, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.0124, "optim/adamw_lr": 0.000372, "perf/tokens_per_sec": 1789101.2469285487, "perf/iters_per_sec": 0.8531099543230766, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1721818447113037, "data/tokens_consumed": 652214272, "data/tokens_consumed_B": 0.652214272, "train/loss_slope": -0.0006183970887814795} {"step": 320, "timestamp": 1778326066.400363, "train/loss": 2.463829827308655, "train/z_loss": 0.0013292433694005012, "train/perplexity": 11.749724897162764, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.0128, "optim/adamw_lr": 0.00038399999999999996, "perf/tokens_per_sec": 2026918.9835058101, "perf/iters_per_sec": 0.9665102880028773, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0346501350402832, "data/tokens_consumed": 673185792, "data/tokens_consumed_B": 0.673185792, "train/loss_slope": -0.0006291585889729582} {"step": 330, "timestamp": 1778326076.758906, "train/loss": 2.53429057598114, "train/z_loss": 0.0013197672087699175, "train/perplexity": 12.607483623024429, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.013200000000000002, "optim/adamw_lr": 0.000396, "perf/tokens_per_sec": 2025644.4012430268, "perf/iters_per_sec": 0.9659025198187956, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0353011608123779, "data/tokens_consumed": 694157312, "data/tokens_consumed_B": 0.694157312, "train/loss_slope": -0.0005994308641432255} {"step": 340, "timestamp": 1778326087.1157124, "train/loss": 2.494998908042908, "train/z_loss": 0.0013312191353179515, "train/perplexity": 12.121720282183784, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.013600000000000001, "optim/adamw_lr": 0.000408, "perf/tokens_per_sec": 2026192.90147732, "perf/iters_per_sec": 0.9661640651117896, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035020899772644, "data/tokens_consumed": 715128832, "data/tokens_consumed_B": 0.715128832, "train/loss_slope": -0.0005903720849034495} {"step": 350, "timestamp": 1778326097.4568696, "grad/layer_0/attn": 0.002828846452757716, "grad/layer_0/mlp": 0.0032467651180922985, "grad/layer_0/attn_mlp_ratio": 0.8712814948842716, "grad/layer_4/attn": 0.0020457631908357143, "grad/layer_4/mlp": 0.0030221708584576845, "grad/layer_4/attn_mlp_ratio": 0.6769184202205875, "grad/layer_8/attn": 0.003973857033997774, "grad/layer_8/mlp": 0.003650758881121874, "grad/layer_8/attn_mlp_ratio": 1.0885016114585062, "grad/layer_12/attn": 0.006177008617669344, "grad/layer_12/mlp": 0.007152685895562172, "grad/layer_12/attn_mlp_ratio": 0.8635928686792376, "grad/layer_16/attn": 0.005226531531661749, "grad/layer_16/mlp": 0.004473648965358734, "grad/layer_16/attn_mlp_ratio": 1.1682926969245055, "grad/layer_20/attn": 0.003036975394934416, "grad/layer_20/mlp": 0.005730479024350643, "grad/layer_20/attn_mlp_ratio": 0.5299688436224004, "grad/layer_24/attn": 0.004934411961585283, "grad/layer_24/mlp": 0.008484485559165478, "grad/layer_24/attn_mlp_ratio": 0.5815805647870734, "grad/layer_27/attn": 0.009487307630479336, "grad/layer_27/mlp": 0.0070732212625443935, "grad/layer_27/attn_mlp_ratio": 1.3412994086002339} {"step": 350, "timestamp": 1778326097.4725468, "train/loss": 2.5055256128311156, "train/z_loss": 0.0013159862952306866, "train/perplexity": 12.249996029045294, "train/grad_norm": 0.09375, "optim/muon_lr": 0.013999999999999999, "optim/adamw_lr": 0.00041999999999999996, "perf/tokens_per_sec": 2025853.6875140376, "perf/iters_per_sec": 0.9660023152895153, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035194206237793, "data/tokens_consumed": 736100352, "data/tokens_consumed_B": 0.736100352, "train/loss_slope": -0.000575340398950466} {"step": 360, "timestamp": 1778326107.8247988, "train/loss": 2.4806451320648195, "train/z_loss": 0.0013440450769849122, "train/perplexity": 11.948970595903553, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.0144, "optim/adamw_lr": 0.00043199999999999993, "perf/tokens_per_sec": 2027277.6174088665, "perf/iters_per_sec": 0.966681297974046, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034467101097107, "data/tokens_consumed": 757071872, "data/tokens_consumed_B": 0.757071872, "train/loss_slope": -0.0005707803276716783} {"step": 370, "timestamp": 1778326118.1757133, "train/loss": 2.5137316465377806, "train/z_loss": 0.0013257209793664515, "train/perplexity": 12.35093349105566, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.0148, "optim/adamw_lr": 0.00044399999999999995, "perf/tokens_per_sec": 2027212.6736345412, "perf/iters_per_sec": 0.9666503303692537, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034500241279602, "data/tokens_consumed": 778043392, "data/tokens_consumed_B": 0.778043392, "train/loss_slope": -0.0005511971424752686} {"step": 375, "timestamp": 1778326123.9526982, "eos/sharpness": 4.241514205932616, "eos/L0_probe": 2.4949443340301514, "eos/L_plus": 2.517460584640503, "eos/L_minus": 2.514843225479126, "eos/grad_norm": 0.08104485273361206, "eos/embed_grad_frac": 0.32728007435798645, "eos/time_s": 0.600139856338501} {"step": 375, "timestamp": 1778326125.3323832, "geo/rankme_last": 419.32476806640625, "geo/layer_0/stable_rank_q_proj": 19.337526321411133, "geo/layer_0/stable_rank_k_proj": 16.102191925048828, "geo/layer_0/stable_rank_o_proj": 46.6667594909668, "geo/layer_0/stable_rank_gate_proj": 128.8767852783203, "geo/layer_0/stable_rank_down_proj": 55.92224884033203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06703102588653564, "geo/layer_0/attn_entropy_mean": 6.19887638092041, "geo/layer_0/attn_entropy_std": 0.3963252604007721, "geo/layer_7/stable_rank_q_proj": 42.88713455200195, "geo/layer_7/stable_rank_k_proj": 40.2511100769043, "geo/layer_7/stable_rank_o_proj": 89.4981689453125, "geo/layer_7/stable_rank_gate_proj": 78.64375305175781, "geo/layer_7/stable_rank_down_proj": 140.50949096679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4357060194015503, "geo/layer_7/attn_entropy_mean": 4.75469970703125, "geo/layer_7/attn_entropy_std": 0.7898223400115967, "geo/layer_14/stable_rank_q_proj": 50.28725051879883, "geo/layer_14/stable_rank_k_proj": 41.075679779052734, "geo/layer_14/stable_rank_o_proj": 43.45069885253906, "geo/layer_14/stable_rank_gate_proj": 71.05696868896484, "geo/layer_14/stable_rank_down_proj": 126.65505981445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38645824790000916, "geo/layer_14/attn_entropy_mean": 5.571761131286621, "geo/layer_14/attn_entropy_std": 0.3995281755924225, "geo/layer_21/stable_rank_q_proj": 40.0295524597168, "geo/layer_21/stable_rank_k_proj": 30.16984748840332, "geo/layer_21/stable_rank_o_proj": 68.73171997070312, "geo/layer_21/stable_rank_gate_proj": 64.22355651855469, "geo/layer_21/stable_rank_down_proj": 49.82157897949219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14403022825717926, "geo/layer_21/attn_entropy_mean": 5.896753311157227, "geo/layer_21/attn_entropy_std": 0.2996179461479187, "geo/layer_27/stable_rank_q_proj": 43.5781364440918, "geo/layer_27/stable_rank_k_proj": 32.07975769042969, "geo/layer_27/stable_rank_o_proj": 115.16568756103516, "geo/layer_27/stable_rank_gate_proj": 77.78457641601562, "geo/layer_27/stable_rank_down_proj": 127.69671630859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11451277881860733, "geo/layer_27/attn_entropy_mean": 4.368821144104004, "geo/layer_27/attn_entropy_std": 0.6409921646118164, "attnres/final_alpha/block_0": 0.24503448605537415, "attnres/block_norm/0": 1.7872804403305054, "attnres/final_alpha/block_1": 0.00397389056161046, "attnres/block_norm/1": 48112.171875, "attnres/final_alpha/block_2": 0.009123943746089935, "attnres/block_norm/2": 29304.2890625, "attnres/final_alpha/block_3": 0.010730898939073086, "attnres/block_norm/3": 65257.9609375, "attnres/final_alpha/block_4": 0.012557446025311947, "attnres/block_norm/4": 15843.267578125, "attnres/final_alpha/block_5": 0.6152273416519165, "attnres/block_norm/5": 6640.23046875, "attnres/final_alpha/block_6": 0.10335206240415573, "attnres/block_norm/6": 42907.1171875, "geo/tier1_time_s": 1.3597450256347656, "geo/step": 375.0, "geo/rankme_slope": 0.008286504836309523} {"step": 380, "timestamp": 1778326130.51007, "train/loss": 2.4782739162445067, "train/z_loss": 0.001327402878087014, "train/perplexity": 11.920670573780306, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.0152, "optim/adamw_lr": 0.00045599999999999997, "perf/tokens_per_sec": 1700948.0368786233, "perf/iters_per_sec": 0.811075228156387, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.2329312562942505, "data/tokens_consumed": 799014912, "data/tokens_consumed_B": 0.799014912, "train/loss_slope": -0.0005459937146800731} {"step": 390, "timestamp": 1778326140.8676512, "train/loss": 2.5143774509429933, "train/z_loss": 0.0013356372714042664, "train/perplexity": 12.358912354427922, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.015600000000000001, "optim/adamw_lr": 0.000468, "perf/tokens_per_sec": 2026289.2404713868, "perf/iters_per_sec": 0.9662100031239447, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349716901779176, "data/tokens_consumed": 819986432, "data/tokens_consumed_B": 0.819986432, "train/loss_slope": -0.0005263265789561601} {"step": 400, "timestamp": 1778326151.210008, "grad/layer_0/attn": 0.005552140064537525, "grad/layer_0/mlp": 0.0054610599763691425, "grad/layer_0/attn_mlp_ratio": 1.0166780784123763, "grad/layer_4/attn": 0.009155907668173313, "grad/layer_4/mlp": 0.003919566050171852, "grad/layer_4/attn_mlp_ratio": 2.335949265143, "grad/layer_8/attn": 0.006160959601402283, "grad/layer_8/mlp": 0.003983616828918457, "grad/layer_8/attn_mlp_ratio": 1.5465743095621822, "grad/layer_12/attn": 0.007608439307659864, "grad/layer_12/mlp": 0.00898154079914093, "grad/layer_12/attn_mlp_ratio": 0.8471195970824561, "grad/layer_16/attn": 0.007493716664612293, "grad/layer_16/mlp": 0.006204414181411266, "grad/layer_16/attn_mlp_ratio": 1.2078040447853138, "grad/layer_20/attn": 0.009119280613958836, "grad/layer_20/mlp": 0.011484574526548386, "grad/layer_20/attn_mlp_ratio": 0.7940460060991894, "grad/layer_24/attn": 0.01122645940631628, "grad/layer_24/mlp": 0.01635892502963543, "grad/layer_24/attn_mlp_ratio": 0.6862589881274473, "grad/layer_27/attn": 0.015058170072734356, "grad/layer_27/mlp": 0.013535687699913979, "grad/layer_27/attn_mlp_ratio": 1.1124791215138734} {"step": 400, "timestamp": 1778326151.2256112, "train/loss": 2.4565050840377807, "train/z_loss": 0.0013520325999706983, "train/perplexity": 11.663975607936028, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.016, "optim/adamw_lr": 0.00047999999999999996, "perf/tokens_per_sec": 2025874.8704782713, "perf/iters_per_sec": 0.9660124161139828, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351833820343017, "data/tokens_consumed": 840957952, "data/tokens_consumed_B": 0.840957952, "train/loss_slope": -0.0005277312046974793} {"step": 410, "timestamp": 1778326162.2335057, "train/loss": 2.5063090085983277, "train/z_loss": 0.0013389808242209255, "train/perplexity": 12.259596384030557, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.016399999999999998, "optim/adamw_lr": 0.0004919999999999999, "perf/tokens_per_sec": 1906261.683142783, "perf/iters_per_sec": 0.9089764037813105, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1001385688781737, "data/tokens_consumed": 861929472, "data/tokens_consumed_B": 0.861929472, "train/loss_slope": -0.0005106450885246912} {"step": 420, "timestamp": 1778326172.5892599, "train/loss": 2.507884907722473, "train/z_loss": 0.0013283996027894317, "train/perplexity": 12.278931502331027, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.0168, "optim/adamw_lr": 0.0005039999999999999, "perf/tokens_per_sec": 2026348.383335723, "perf/iters_per_sec": 0.9662382046393028, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349414825439454, "data/tokens_consumed": 882900992, "data/tokens_consumed_B": 0.882900992, "train/loss_slope": -0.0004937155745030169} {"step": 430, "timestamp": 1778326182.9431381, "train/loss": 2.494223403930664, "train/z_loss": 0.0013403997058048845, "train/perplexity": 12.112323482356876, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.0172, "optim/adamw_lr": 0.000516, "perf/tokens_per_sec": 2026755.5685149387, "perf/iters_per_sec": 0.9664323656630224, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347335577011108, "data/tokens_consumed": 903872512, "data/tokens_consumed_B": 0.903872512, "train/loss_slope": -0.0004816357692922817} {"step": 440, "timestamp": 1778326193.2976625, "train/loss": 2.514746832847595, "train/z_loss": 0.0013355616247281432, "train/perplexity": 12.363478356259517, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.0176, "optim/adamw_lr": 0.0005279999999999999, "perf/tokens_per_sec": 2026489.3692008022, "perf/iters_per_sec": 0.9663054319385539, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0348694801330567, "data/tokens_consumed": 924844032, "data/tokens_consumed_B": 0.924844032, "train/loss_slope": -0.00046375171123600134} {"step": 450, "timestamp": 1778326203.6363819, "grad/layer_0/attn": 0.0032757248263806105, "grad/layer_0/mlp": 0.0034465533681213856, "grad/layer_0/attn_mlp_ratio": 0.9504349364311795, "grad/layer_4/attn": 0.003607620717957616, "grad/layer_4/mlp": 0.0029107355512678623, "grad/layer_4/attn_mlp_ratio": 1.2394188789992673, "grad/layer_8/attn": 0.003686003852635622, "grad/layer_8/mlp": 0.003757528029382229, "grad/layer_8/attn_mlp_ratio": 0.9809650721740928, "grad/layer_12/attn": 0.007308033760637045, "grad/layer_12/mlp": 0.007819822989404202, "grad/layer_12/attn_mlp_ratio": 0.9345523136628722, "grad/layer_16/attn": 0.004973405972123146, "grad/layer_16/mlp": 0.005057013593614101, "grad/layer_16/attn_mlp_ratio": 0.9834669774383776, "grad/layer_20/attn": 0.005058463662862778, "grad/layer_20/mlp": 0.006759467534720898, "grad/layer_20/attn_mlp_ratio": 0.7483523756929187, "grad/layer_24/attn": 0.007780601270496845, "grad/layer_24/mlp": 0.008109810762107372, "grad/layer_24/attn_mlp_ratio": 0.959406008696363, "grad/layer_27/attn": 0.00587126798927784, "grad/layer_27/mlp": 0.009149993769824505, "grad/layer_27/attn_mlp_ratio": 0.6416690626034758} {"step": 450, "timestamp": 1778326204.2431266, "eos/sharpness": 12.00277805328369, "eos/L0_probe": 2.4852025508880615, "eos/L_plus": 2.5410661697387695, "eos/L_minus": 2.5493667125701904, "eos/grad_norm": 0.1269795447587967, "eos/embed_grad_frac": 0.2793909013271332, "eos/time_s": 0.6040008068084717} {"step": 450, "timestamp": 1778326204.2636876, "train/loss": 2.5366223573684694, "train/z_loss": 0.0013271401170641185, "train/perplexity": 12.636915820066545, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.018000000000000002, "optim/adamw_lr": 0.00054, "perf/tokens_per_sec": 1913148.7132901617, "perf/iters_per_sec": 0.9122603956652459, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0961782455444335, "data/tokens_consumed": 945815552, "data/tokens_consumed_B": 0.945815552, "train/loss_slope": -0.00044074317154015356} {"step": 450, "timestamp": 1778326205.629177, "geo/rankme_last": 420.9298400878906, "geo/layer_0/stable_rank_q_proj": 19.430694580078125, "geo/layer_0/stable_rank_k_proj": 16.23636817932129, "geo/layer_0/stable_rank_o_proj": 46.618289947509766, "geo/layer_0/stable_rank_gate_proj": 128.79290771484375, "geo/layer_0/stable_rank_down_proj": 55.814300537109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0666998103260994, "geo/layer_0/attn_entropy_mean": 6.212673187255859, "geo/layer_0/attn_entropy_std": 0.3908732831478119, "geo/layer_7/stable_rank_q_proj": 42.87285614013672, "geo/layer_7/stable_rank_k_proj": 40.19947814941406, "geo/layer_7/stable_rank_o_proj": 89.44734191894531, "geo/layer_7/stable_rank_gate_proj": 78.72080993652344, "geo/layer_7/stable_rank_down_proj": 140.67852783203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42995014786720276, "geo/layer_7/attn_entropy_mean": 4.740608215332031, "geo/layer_7/attn_entropy_std": 0.7869494557380676, "geo/layer_14/stable_rank_q_proj": 50.33417892456055, "geo/layer_14/stable_rank_k_proj": 41.07701873779297, "geo/layer_14/stable_rank_o_proj": 43.434085845947266, "geo/layer_14/stable_rank_gate_proj": 71.11089324951172, "geo/layer_14/stable_rank_down_proj": 126.47207641601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4009270966053009, "geo/layer_14/attn_entropy_mean": 5.569405555725098, "geo/layer_14/attn_entropy_std": 0.3894648849964142, "geo/layer_21/stable_rank_q_proj": 40.019935607910156, "geo/layer_21/stable_rank_k_proj": 30.151334762573242, "geo/layer_21/stable_rank_o_proj": 68.669677734375, "geo/layer_21/stable_rank_gate_proj": 64.20284271240234, "geo/layer_21/stable_rank_down_proj": 49.830894470214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14914967119693756, "geo/layer_21/attn_entropy_mean": 5.90501594543457, "geo/layer_21/attn_entropy_std": 0.3003038465976715, "geo/layer_27/stable_rank_q_proj": 43.571044921875, "geo/layer_27/stable_rank_k_proj": 32.06731033325195, "geo/layer_27/stable_rank_o_proj": 115.39905548095703, "geo/layer_27/stable_rank_gate_proj": 77.59227752685547, "geo/layer_27/stable_rank_down_proj": 127.71318054199219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1123330369591713, "geo/layer_27/attn_entropy_mean": 4.35097599029541, "geo/layer_27/attn_entropy_std": 0.6546382308006287, "attnres/final_alpha/block_0": 0.24550071358680725, "attnres/block_norm/0": 1.7875040769577026, "attnres/final_alpha/block_1": 0.004006864037364721, "attnres/block_norm/1": 48031.5703125, "attnres/final_alpha/block_2": 0.009024253115057945, "attnres/block_norm/2": 29200.166015625, "attnres/final_alpha/block_3": 0.010592347010970116, "attnres/block_norm/3": 64909.84375, "attnres/final_alpha/block_4": 0.012480679899454117, "attnres/block_norm/4": 15936.4697265625, "attnres/final_alpha/block_5": 0.6169427633285522, "attnres/block_norm/5": 6697.177734375, "attnres/final_alpha/block_6": 0.10145237296819687, "attnres/block_norm/6": 43326.53125, "geo/tier1_time_s": 1.3620333671569824, "geo/step": 450.0, "geo/rankme_slope": 0.008666919526599702} {"step": 460, "timestamp": 1778326215.9809647, "train/loss": 2.4782485723495484, "train/z_loss": 0.0013322774437256157, "train/perplexity": 11.920368461385822, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.0184, "optim/adamw_lr": 0.000552, "perf/tokens_per_sec": 1790304.5211891779, "perf/iters_per_sec": 0.853683720202054, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1713940143585204, "data/tokens_consumed": 966787072, "data/tokens_consumed_B": 0.966787072, "train/loss_slope": -0.00043490225991973377} {"step": 470, "timestamp": 1778326226.3419242, "train/loss": 2.4679965019226073, "train/z_loss": 0.0013416853500530124, "train/perplexity": 11.798784313948529, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.0188, "optim/adamw_lr": 0.0005639999999999999, "perf/tokens_per_sec": 2025073.9122768715, "perf/iters_per_sec": 0.9656304894813879, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0355928182601928, "data/tokens_consumed": 987758592, "data/tokens_consumed_B": 0.987758592, "train/loss_slope": -0.0004312675412302684} {"step": 480, "timestamp": 1778326236.6923335, "train/loss": 2.4683453321456907, "train/z_loss": 0.001344849867746234, "train/perplexity": 11.802900804449285, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.0192, "optim/adamw_lr": 0.0005759999999999999, "perf/tokens_per_sec": 2027195.6207432742, "perf/iters_per_sec": 0.9666421989170428, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0345089435577393, "data/tokens_consumed": 1008730112, "data/tokens_consumed_B": 1.008730112, "train/loss_slope": -0.00042691849421481695} {"step": 490, "timestamp": 1778326247.044468, "train/loss": 2.5142488956451414, "train/z_loss": 0.0013365906546823681, "train/perplexity": 12.357323652889262, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.0196, "optim/adamw_lr": 0.000588, "perf/tokens_per_sec": 2026801.2884310503, "perf/iters_per_sec": 0.9664541666178943, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347102165222168, "data/tokens_consumed": 1029701632, "data/tokens_consumed_B": 1.029701632, "train/loss_slope": -0.0004112655141631238} {"step": 500, "timestamp": 1778326257.3958783, "grad/layer_0/attn": 0.003160040592774749, "grad/layer_0/mlp": 0.0034124928060919046, "grad/layer_0/attn_mlp_ratio": 0.9260211463395327, "grad/layer_4/attn": 0.003992364276200533, "grad/layer_4/mlp": 0.0031085037626326084, "grad/layer_4/attn_mlp_ratio": 1.2843362764295807, "grad/layer_8/attn": 0.004779738839715719, "grad/layer_8/mlp": 0.0038937083445489407, "grad/layer_8/attn_mlp_ratio": 1.2275543759336143, "grad/layer_12/attn": 0.00881215464323759, "grad/layer_12/mlp": 0.00897503737360239, "grad/layer_12/attn_mlp_ratio": 0.9818515709996894, "grad/layer_16/attn": 0.008117912337183952, "grad/layer_16/mlp": 0.005874824244529009, "grad/layer_16/attn_mlp_ratio": 1.3818136272863788, "grad/layer_20/attn": 0.00581567594781518, "grad/layer_20/mlp": 0.009900709614157677, "grad/layer_20/attn_mlp_ratio": 0.5873999052309313, "grad/layer_24/attn": 0.015169654972851276, "grad/layer_24/mlp": 0.014957564882934093, "grad/layer_24/attn_mlp_ratio": 1.0141794463309481, "grad/layer_27/attn": 0.006853190250694752, "grad/layer_27/mlp": 0.015509629622101784, "grad/layer_27/attn_mlp_ratio": 0.4418667868600827} {"step": 500, "timestamp": 1778326257.4114137, "train/loss": 2.469688606262207, "train/z_loss": 0.0013550536474213003, "train/perplexity": 11.818765988859925, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023930.963931336, "perf/iters_per_sec": 0.9650854892403298, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.036177635192871, "data/tokens_consumed": 1050673152, "data/tokens_consumed_B": 1.050673152, "train/loss_slope": -0.0004065342816831839} {"step": 500, "timestamp": 1778326264.5300286, "geo/ww_alpha_mean": 7.665439200871858, "geo/ww_alpha_std": 4.583107120101462, "geo/ww_alpha_min": 1.35224235945669, "geo/ww_alpha_max": 29.164359098117295, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9259560873591552, "geo/ww_alpha_by_type/k_proj": 4.546924867510752, "geo/ww_alpha_by_type/v_proj": 9.512762475358343, "geo/ww_alpha_by_type/o_proj": 8.158552472171479, "geo/ww_alpha_by_type/gate_proj": 7.7809737697663275, "geo/ww_alpha_by_type/up_proj": 11.55724550487979, "geo/ww_alpha_by_type/down_proj": 8.266825138067446, "geo/twonn_id/layer_0": 0.7570964694023132, "geo/twonn_id/layer_7": 3.517218828201294, "geo/twonn_id/layer_14": 5.1876959800720215, "geo/twonn_id/layer_21": 7.774787902832031, "geo/twonn_id/layer_27": 6.7667622566223145, "geo/tier2_time_s": 7.111745357513428} {"step": 500, "timestamp": 1778326265.3149836, "eoc/jacobian_sigma/layer_0/attn": 1294.6851806640625, "eoc/jacobian_sigma/layer_0/mlp": 8665.0224609375, "eoc/jacobian_sigma/layer_0": 8665.0224609375, "eoc/jacobian_sigma/layer_7/attn": 1.032926321029663, "eoc/jacobian_sigma/layer_7/mlp": 1.5337871313095093, "eoc/jacobian_sigma/layer_7": 1.5337871313095093, "eoc/jacobian_sigma/layer_14/attn": 1.4857351779937744, "eoc/jacobian_sigma/layer_14/mlp": 11.387519836425781, "eoc/jacobian_sigma/layer_14": 11.387519836425781, "eoc/jacobian_sigma/layer_21/attn": 1.0104278326034546, "eoc/jacobian_sigma/layer_21/mlp": 5.00540018081665, "eoc/jacobian_sigma/layer_21": 5.00540018081665, "eoc/jacobian_sigma/layer_27/attn": 3.165970802307129, "eoc/jacobian_sigma/layer_27/mlp": 24.633411407470703, "eoc/jacobian_sigma/layer_27": 24.633411407470703, "eoc/layer0_sigma": 8665.0224609375, "eoc/sigma_max": 24.633411407470703, "eoc/sigma_min": 1.5337871313095093, "eoc/sigma_mean": 10.640029639005661, "eoc/time_s": 0.7786083221435547} {"step": 510, "timestamp": 1778326275.690733, "train/loss": 2.465353083610535, "train/z_loss": 0.0013431615778245031, "train/perplexity": 11.767636378082468, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1147638.2839221086, "perf/iters_per_sec": 0.5472365779505294, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.8273632287979127, "data/tokens_consumed": 1071644672, "data/tokens_consumed_B": 1.071644672, "train/loss_slope": -0.00040238718020461044} {"step": 520, "timestamp": 1778326286.0574412, "train/loss": 2.4832303047180178, "train/z_loss": 0.0013498322339728474, "train/perplexity": 11.979900710539415, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024604.9558306579, "perf/iters_per_sec": 0.9654068736222543, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035832691192627, "data/tokens_consumed": 1092616192, "data/tokens_consumed_B": 1.092616192, "train/loss_slope": -0.00039410074955608504} {"step": 525, "timestamp": 1778326291.8462787, "eos/sharpness": 58.67962837219237, "eos/L0_probe": 2.4816341400146484, "eos/L_plus": 2.7440130710601807, "eos/L_minus": 2.80605149269104, "eos/grad_norm": 0.21048037707805634, "eos/embed_grad_frac": 0.05803400278091431, "eos/time_s": 0.6075232028961182} {"step": 525, "timestamp": 1778326293.2271926, "geo/rankme_last": 419.0769348144531, "geo/layer_0/stable_rank_q_proj": 19.533153533935547, "geo/layer_0/stable_rank_k_proj": 16.367761611938477, "geo/layer_0/stable_rank_o_proj": 46.56875991821289, "geo/layer_0/stable_rank_gate_proj": 128.7152557373047, "geo/layer_0/stable_rank_down_proj": 55.7530403137207, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06476879119873047, "geo/layer_0/attn_entropy_mean": 6.223019599914551, "geo/layer_0/attn_entropy_std": 0.39410707354545593, "geo/layer_7/stable_rank_q_proj": 42.85928726196289, "geo/layer_7/stable_rank_k_proj": 40.1683349609375, "geo/layer_7/stable_rank_o_proj": 89.58985137939453, "geo/layer_7/stable_rank_gate_proj": 78.76690673828125, "geo/layer_7/stable_rank_down_proj": 140.89830017089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42967331409454346, "geo/layer_7/attn_entropy_mean": 4.690962314605713, "geo/layer_7/attn_entropy_std": 0.8005928993225098, "geo/layer_14/stable_rank_q_proj": 50.322383880615234, "geo/layer_14/stable_rank_k_proj": 41.06553649902344, "geo/layer_14/stable_rank_o_proj": 43.41328811645508, "geo/layer_14/stable_rank_gate_proj": 71.13230895996094, "geo/layer_14/stable_rank_down_proj": 126.36808776855469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38171857595443726, "geo/layer_14/attn_entropy_mean": 5.560334205627441, "geo/layer_14/attn_entropy_std": 0.37791678309440613, "geo/layer_21/stable_rank_q_proj": 40.026939392089844, "geo/layer_21/stable_rank_k_proj": 30.15130615234375, "geo/layer_21/stable_rank_o_proj": 68.62202453613281, "geo/layer_21/stable_rank_gate_proj": 64.17977905273438, "geo/layer_21/stable_rank_down_proj": 49.820213317871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1409897804260254, "geo/layer_21/attn_entropy_mean": 5.904951095581055, "geo/layer_21/attn_entropy_std": 0.29603224992752075, "geo/layer_27/stable_rank_q_proj": 43.582542419433594, "geo/layer_27/stable_rank_k_proj": 31.96814727783203, "geo/layer_27/stable_rank_o_proj": 115.35890197753906, "geo/layer_27/stable_rank_gate_proj": 77.29700469970703, "geo/layer_27/stable_rank_down_proj": 127.4812240600586, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.12180514633655548, "geo/layer_27/attn_entropy_mean": 4.357919692993164, "geo/layer_27/attn_entropy_std": 0.6585018038749695, "attnres/final_alpha/block_0": 0.24623551964759827, "attnres/block_norm/0": 1.7871575355529785, "attnres/final_alpha/block_1": 0.003988150041550398, "attnres/block_norm/1": 48310.74609375, "attnres/final_alpha/block_2": 0.009019474498927593, "attnres/block_norm/2": 29240.765625, "attnres/final_alpha/block_3": 0.010876642540097237, "attnres/block_norm/3": 65180.88671875, "attnres/final_alpha/block_4": 0.012582958675920963, "attnres/block_norm/4": 15915.7216796875, "attnres/final_alpha/block_5": 0.6139209270477295, "attnres/block_norm/5": 6729.7919921875, "attnres/final_alpha/block_6": 0.10337630659341812, "attnres/block_norm/6": 43699.1796875, "geo/tier1_time_s": 1.3595695495605469, "geo/step": 525.0, "geo/rankme_slope": 0.0060443987165178575} {"step": 530, "timestamp": 1778326298.4089534, "train/loss": 2.4614047527313234, "train/z_loss": 0.0013392986613325775, "train/perplexity": 11.721265460100131, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698656.2107219838, "perf/iters_per_sec": 0.8099824002847594, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.2345947265625, "data/tokens_consumed": 1113587712, "data/tokens_consumed_B": 1.113587712, "train/loss_slope": -0.00039031459218732777} {"step": 540, "timestamp": 1778326308.7626584, "train/loss": 2.4526878356933595, "train/z_loss": 0.0013592820963822306, "train/perplexity": 11.619536188469436, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026280.9784834087, "perf/iters_per_sec": 0.9662060635010761, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349759101867675, "data/tokens_consumed": 1134559232, "data/tokens_consumed_B": 1.134559232, "train/loss_slope": -0.00038786446652543143} {"step": 550, "timestamp": 1778326319.1162593, "grad/layer_0/attn": 0.003711764235049486, "grad/layer_0/mlp": 0.003961602225899696, "grad/layer_0/attn_mlp_ratio": 0.9369350908300795, "grad/layer_4/attn": 0.0026644468307495117, "grad/layer_4/mlp": 0.0028269116301089525, "grad/layer_4/attn_mlp_ratio": 0.9425291926772756, "grad/layer_8/attn": 0.005658328533172607, "grad/layer_8/mlp": 0.0037742157001048326, "grad/layer_8/attn_mlp_ratio": 1.4992064134264504, "grad/layer_12/attn": 0.005623759236186743, "grad/layer_12/mlp": 0.007397412788122892, "grad/layer_12/attn_mlp_ratio": 0.7602332492777462, "grad/layer_16/attn": 0.003404578659683466, "grad/layer_16/mlp": 0.004499522037804127, "grad/layer_16/attn_mlp_ratio": 0.7566533857181956, "grad/layer_20/attn": 0.003958974964916706, "grad/layer_20/mlp": 0.0076185306534171104, "grad/layer_20/attn_mlp_ratio": 0.5196507165296934, "grad/layer_24/attn": 0.013243752531707287, "grad/layer_24/mlp": 0.012313955463469028, "grad/layer_24/attn_mlp_ratio": 1.075507578653006, "grad/layer_27/attn": 0.004178271628916264, "grad/layer_27/mlp": 0.013029636815190315, "grad/layer_27/attn_mlp_ratio": 0.32067444826840247} {"step": 550, "timestamp": 1778326319.1319506, "train/loss": 2.443239164352417, "train/z_loss": 0.0013617065153084696, "train/perplexity": 11.510264061059367, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023771.5225245506, "perf/iters_per_sec": 0.9650094616530183, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0362592697143556, "data/tokens_consumed": 1155530752, "data/tokens_consumed_B": 1.155530752, "train/loss_slope": -0.00038671464082401074} {"step": 560, "timestamp": 1778326329.4951992, "train/loss": 2.510306978225708, "train/z_loss": 0.0013411946129053831, "train/perplexity": 12.308707985948603, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024454.1677740132, "perf/iters_per_sec": 0.9653349722738329, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0359098434448242, "data/tokens_consumed": 1176502272, "data/tokens_consumed_B": 1.176502272, "train/loss_slope": -0.0003728078173206418} {"step": 570, "timestamp": 1778326339.850614, "train/loss": 2.423031210899353, "train/z_loss": 0.0013526983326300978, "train/perplexity": 11.279999603276092, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026123.3601255906, "perf/iters_per_sec": 0.9661309052112534, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0350564241409301, "data/tokens_consumed": 1197473792, "data/tokens_consumed_B": 1.197473792, "train/loss_slope": -0.00037493978608204236} {"step": 580, "timestamp": 1778326350.2071557, "train/loss": 2.4640973806381226, "train/z_loss": 0.00135516244918108, "train/perplexity": 11.752868995767598, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026053.12353878, "perf/iters_per_sec": 0.9660974137967968, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035092306137085, "data/tokens_consumed": 1218445312, "data/tokens_consumed_B": 1.218445312, "train/loss_slope": -0.0003692663125584934} {"step": 590, "timestamp": 1778326360.5788283, "train/loss": 2.505229663848877, "train/z_loss": 0.0013421184732578695, "train/perplexity": 12.246371191598007, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022952.9649799406, "perf/iters_per_sec": 0.9646191429996207, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0366785764694213, "data/tokens_consumed": 1239416832, "data/tokens_consumed_B": 1.239416832, "train/loss_slope": -0.0003567932329763468} {"step": 600, "timestamp": 1778326370.9255476, "grad/layer_0/attn": 0.00419065123423934, "grad/layer_0/mlp": 0.0038225557655096054, "grad/layer_0/attn_mlp_ratio": 1.0962956152063095, "grad/layer_4/attn": 0.002674761461094022, "grad/layer_4/mlp": 0.002750594401732087, "grad/layer_4/attn_mlp_ratio": 0.9724303089421896, "grad/layer_8/attn": 0.0040484340861439705, "grad/layer_8/mlp": 0.0036996162962168455, "grad/layer_8/attn_mlp_ratio": 1.0942848264711496, "grad/layer_12/attn": 0.005241916514933109, "grad/layer_12/mlp": 0.008008847944438457, "grad/layer_12/attn_mlp_ratio": 0.6545156664038877, "grad/layer_16/attn": 0.007050578482449055, "grad/layer_16/mlp": 0.005346533842384815, "grad/layer_16/attn_mlp_ratio": 1.318719484141936, "grad/layer_20/attn": 0.0032304669730365276, "grad/layer_20/mlp": 0.00680071534588933, "grad/layer_20/attn_mlp_ratio": 0.4750186945388478, "grad/layer_24/attn": 0.014725890010595322, "grad/layer_24/mlp": 0.011747600510716438, "grad/layer_24/attn_mlp_ratio": 1.253523208574355, "grad/layer_27/attn": 0.006761900149285793, "grad/layer_27/mlp": 0.012610850855708122, "grad/layer_27/attn_mlp_ratio": 0.5361969761624307} {"step": 600, "timestamp": 1778326371.5324805, "eos/sharpness": 74.53238964080809, "eos/L0_probe": 2.4692792892456055, "eos/L_plus": 2.773848056793213, "eos/L_minus": 2.910034418106079, "eos/grad_norm": 0.22960780560970306, "eos/embed_grad_frac": 0.04848777502775192, "eos/time_s": 0.6042196750640869} {"step": 600, "timestamp": 1778326371.5526452, "train/loss": 2.447311210632324, "train/z_loss": 0.001350158080458641, "train/perplexity": 11.557229947744737, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911911.8788806635, "perf/iters_per_sec": 0.9116706270602529, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0968873739242553, "data/tokens_consumed": 1260388352, "data/tokens_consumed_B": 1.260388352, "train/loss_slope": -0.00035412994217834783} {"step": 600, "timestamp": 1778326372.9141057, "geo/rankme_last": 420.3772277832031, "geo/layer_0/stable_rank_q_proj": 19.627504348754883, "geo/layer_0/stable_rank_k_proj": 16.501108169555664, "geo/layer_0/stable_rank_o_proj": 46.57045364379883, "geo/layer_0/stable_rank_gate_proj": 128.64573669433594, "geo/layer_0/stable_rank_down_proj": 55.687870025634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06746655702590942, "geo/layer_0/attn_entropy_mean": 6.228153228759766, "geo/layer_0/attn_entropy_std": 0.39282217621803284, "geo/layer_7/stable_rank_q_proj": 42.734962463378906, "geo/layer_7/stable_rank_k_proj": 40.10500717163086, "geo/layer_7/stable_rank_o_proj": 89.44473266601562, "geo/layer_7/stable_rank_gate_proj": 78.80017852783203, "geo/layer_7/stable_rank_down_proj": 141.0663299560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43922168016433716, "geo/layer_7/attn_entropy_mean": 4.749107837677002, "geo/layer_7/attn_entropy_std": 0.7915140986442566, "geo/layer_14/stable_rank_q_proj": 50.350711822509766, "geo/layer_14/stable_rank_k_proj": 40.967838287353516, "geo/layer_14/stable_rank_o_proj": 43.323524475097656, "geo/layer_14/stable_rank_gate_proj": 71.0146255493164, "geo/layer_14/stable_rank_down_proj": 126.31158447265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3988000452518463, "geo/layer_14/attn_entropy_mean": 5.543388843536377, "geo/layer_14/attn_entropy_std": 0.398812860250473, "geo/layer_21/stable_rank_q_proj": 40.082069396972656, "geo/layer_21/stable_rank_k_proj": 30.235071182250977, "geo/layer_21/stable_rank_o_proj": 68.50204467773438, "geo/layer_21/stable_rank_gate_proj": 64.09891510009766, "geo/layer_21/stable_rank_down_proj": 49.804744720458984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1438007950782776, "geo/layer_21/attn_entropy_mean": 5.903079986572266, "geo/layer_21/attn_entropy_std": 0.3113243877887726, "geo/layer_27/stable_rank_q_proj": 43.60525894165039, "geo/layer_27/stable_rank_k_proj": 31.893571853637695, "geo/layer_27/stable_rank_o_proj": 115.54096221923828, "geo/layer_27/stable_rank_gate_proj": 77.21105194091797, "geo/layer_27/stable_rank_down_proj": 127.51011657714844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11514691263437271, "geo/layer_27/attn_entropy_mean": 4.375103950500488, "geo/layer_27/attn_entropy_std": 0.6484338641166687, "attnres/final_alpha/block_0": 0.2471160739660263, "attnres/block_norm/0": 1.7869374752044678, "attnres/final_alpha/block_1": 0.004080012906342745, "attnres/block_norm/1": 48353.4609375, "attnres/final_alpha/block_2": 0.009273790754377842, "attnres/block_norm/2": 29376.81640625, "attnres/final_alpha/block_3": 0.010971873067319393, "attnres/block_norm/3": 65310.703125, "attnres/final_alpha/block_4": 0.01257343403995037, "attnres/block_norm/4": 16132.470703125, "attnres/final_alpha/block_5": 0.6125509142875671, "attnres/block_norm/5": 6742.05859375, "attnres/final_alpha/block_6": 0.10343389958143234, "attnres/block_norm/6": 43830.5, "geo/tier1_time_s": 1.3570878505706787, "geo/step": 600.0, "geo/rankme_slope": 0.005573411729600694} {"step": 610, "timestamp": 1778326383.2801845, "train/loss": 2.4428622007369993, "train/z_loss": 0.0013415503432042896, "train/perplexity": 11.505925928015024, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788877.8776551036, "perf/iters_per_sec": 0.8530034435534971, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1723282098770142, "data/tokens_consumed": 1281359872, "data/tokens_consumed_B": 1.281359872, "train/loss_slope": -0.000351855640298985} {"step": 620, "timestamp": 1778326393.64263, "train/loss": 2.4892263889312742, "train/z_loss": 0.0013442404568195343, "train/perplexity": 12.051948991985062, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025230.5746922395, "perf/iters_per_sec": 0.9657051919423292, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0355127096176147, "data/tokens_consumed": 1302331392, "data/tokens_consumed_B": 1.302331392, "train/loss_slope": -0.0003423681423731846} {"step": 630, "timestamp": 1778326404.0034046, "train/loss": 2.4691819667816164, "train/z_loss": 0.001355540449731052, "train/perplexity": 11.812779651983456, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025260.9774691178, "perf/iters_per_sec": 0.9657196891160573, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0354971647262574, "data/tokens_consumed": 1323302912, "data/tokens_consumed_B": 1.323302912, "train/loss_slope": -0.00033613995181553524} {"step": 640, "timestamp": 1778326414.3572488, "train/loss": 2.4456783056259157, "train/z_loss": 0.0013516121194697916, "train/perplexity": 11.538373488695916, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026637.3787810176, "perf/iters_per_sec": 0.9663760084061707, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347939014434815, "data/tokens_consumed": 1344274432, "data/tokens_consumed_B": 1.344274432, "train/loss_slope": -0.0003332863491731923} {"step": 650, "timestamp": 1778326424.7026846, "grad/layer_0/attn": 0.003315932583063841, "grad/layer_0/mlp": 0.0036159674637019634, "grad/layer_0/attn_mlp_ratio": 0.9170249800772682, "grad/layer_4/attn": 0.004933467600494623, "grad/layer_4/mlp": 0.0030089393258094788, "grad/layer_4/attn_mlp_ratio": 1.6396034955630252, "grad/layer_8/attn": 0.0040013957768678665, "grad/layer_8/mlp": 0.003993976395577192, "grad/layer_8/attn_mlp_ratio": 1.0018576176647236, "grad/layer_12/attn": 0.007993976585566998, "grad/layer_12/mlp": 0.0084974505007267, "grad/layer_12/attn_mlp_ratio": 0.9407499921074393, "grad/layer_16/attn": 0.008302020840346813, "grad/layer_16/mlp": 0.004895297344774008, "grad/layer_16/attn_mlp_ratio": 1.6959175482196012, "grad/layer_20/attn": 0.003732319688424468, "grad/layer_20/mlp": 0.00849398598074913, "grad/layer_20/attn_mlp_ratio": 0.4394073233629899, "grad/layer_24/attn": 0.022247252985835075, "grad/layer_24/mlp": 0.017353180795907974, "grad/layer_24/attn_mlp_ratio": 1.2820273769566457, "grad/layer_27/attn": 0.01383116189390421, "grad/layer_27/mlp": 0.01746137998998165, "grad/layer_27/attn_mlp_ratio": 0.7921001560374797} {"step": 650, "timestamp": 1778326424.7184153, "train/loss": 2.455442690849304, "train/z_loss": 0.0013406159589067102, "train/perplexity": 11.651590459813008, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025389.3599952438, "perf/iters_per_sec": 0.9657809066749782, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0354315280914306, "data/tokens_consumed": 1365245952, "data/tokens_consumed_B": 1.365245952, "train/loss_slope": -0.0003289073181729664} {"step": 660, "timestamp": 1778326435.072183, "train/loss": 2.4705378293991087, "train/z_loss": 0.0013542568427510559, "train/perplexity": 11.828807021322381, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026709.8039639823, "perf/iters_per_sec": 0.9664105434245979, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347569227218627, "data/tokens_consumed": 1386217472, "data/tokens_consumed_B": 1.386217472, "train/loss_slope": -0.00032248780632049485} {"step": 670, "timestamp": 1778326445.426953, "train/loss": 2.449769139289856, "train/z_loss": 0.0013509287033230065, "train/perplexity": 11.585671734056762, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026340.4943089713, "perf/iters_per_sec": 0.9662344428582055, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034945511817932, "data/tokens_consumed": 1407188992, "data/tokens_consumed_B": 1.407188992, "train/loss_slope": -0.00031886175802137417} {"step": 675, "timestamp": 1778326451.1993725, "eos/sharpness": 20.129013061523434, "eos/L0_probe": 2.4623239040374756, "eos/L_plus": 2.5758321285247803, "eos/L_minus": 2.5501058101654053, "eos/grad_norm": 0.12672019004821777, "eos/embed_grad_frac": 0.17677460610866547, "eos/time_s": 0.6030688285827637} {"step": 675, "timestamp": 1778326452.577969, "geo/rankme_last": 421.2183532714844, "geo/layer_0/stable_rank_q_proj": 19.699312210083008, "geo/layer_0/stable_rank_k_proj": 16.565141677856445, "geo/layer_0/stable_rank_o_proj": 46.56351852416992, "geo/layer_0/stable_rank_gate_proj": 128.95571899414062, "geo/layer_0/stable_rank_down_proj": 55.71894454956055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06867729127407074, "geo/layer_0/attn_entropy_mean": 6.234999656677246, "geo/layer_0/attn_entropy_std": 0.3971279263496399, "geo/layer_7/stable_rank_q_proj": 42.68943786621094, "geo/layer_7/stable_rank_k_proj": 40.07729721069336, "geo/layer_7/stable_rank_o_proj": 89.47225189208984, "geo/layer_7/stable_rank_gate_proj": 78.67652893066406, "geo/layer_7/stable_rank_down_proj": 141.19061279296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42585983872413635, "geo/layer_7/attn_entropy_mean": 4.730537414550781, "geo/layer_7/attn_entropy_std": 0.7897849082946777, "geo/layer_14/stable_rank_q_proj": 50.45770263671875, "geo/layer_14/stable_rank_k_proj": 40.856319427490234, "geo/layer_14/stable_rank_o_proj": 43.361106872558594, "geo/layer_14/stable_rank_gate_proj": 70.92948913574219, "geo/layer_14/stable_rank_down_proj": 126.42235565185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3698095679283142, "geo/layer_14/attn_entropy_mean": 5.553811073303223, "geo/layer_14/attn_entropy_std": 0.4114702343940735, "geo/layer_21/stable_rank_q_proj": 39.99714660644531, "geo/layer_21/stable_rank_k_proj": 30.28509521484375, "geo/layer_21/stable_rank_o_proj": 68.39248657226562, "geo/layer_21/stable_rank_gate_proj": 64.14675903320312, "geo/layer_21/stable_rank_down_proj": 49.82765197753906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14148032665252686, "geo/layer_21/attn_entropy_mean": 5.925442695617676, "geo/layer_21/attn_entropy_std": 0.29547247290611267, "geo/layer_27/stable_rank_q_proj": 43.66623306274414, "geo/layer_27/stable_rank_k_proj": 31.968114852905273, "geo/layer_27/stable_rank_o_proj": 115.45952606201172, "geo/layer_27/stable_rank_gate_proj": 77.14573669433594, "geo/layer_27/stable_rank_down_proj": 127.44532012939453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11139649897813797, "geo/layer_27/attn_entropy_mean": 4.371003150939941, "geo/layer_27/attn_entropy_std": 0.6776920557022095, "attnres/final_alpha/block_0": 0.24746820330619812, "attnres/block_norm/0": 1.7865971326828003, "attnres/final_alpha/block_1": 0.004078926518559456, "attnres/block_norm/1": 48353.4609375, "attnres/final_alpha/block_2": 0.009245982393622398, "attnres/block_norm/2": 29330.76953125, "attnres/final_alpha/block_3": 0.010978635400533676, "attnres/block_norm/3": 65549.8828125, "attnres/final_alpha/block_4": 0.012657839804887772, "attnres/block_norm/4": 16153.796875, "attnres/final_alpha/block_5": 0.6133018136024475, "attnres/block_norm/5": 6798.1708984375, "attnres/final_alpha/block_6": 0.10226857662200928, "attnres/block_norm/6": 43624.2734375, "geo/tier1_time_s": 1.3584089279174805, "geo/step": 675.0, "geo/rankme_slope": 0.005641359887941919} {"step": 680, "timestamp": 1778326457.7605112, "train/loss": 2.478080320358276, "train/z_loss": 0.0013481031637638807, "train/perplexity": 11.918363004371294, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701308.9074315932, "perf/iters_per_sec": 0.8112473046453443, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.232669734954834, "data/tokens_consumed": 1428160512, "data/tokens_consumed_B": 1.428160512, "train/loss_slope": -0.00031162899098868433} {"step": 690, "timestamp": 1778326468.1131725, "train/loss": 2.4795692682266237, "train/z_loss": 0.0013381216675043106, "train/perplexity": 11.936122043421582, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026610.2031620683, "perf/iters_per_sec": 0.9663630500612584, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0348077774047852, "data/tokens_consumed": 1449132032, "data/tokens_consumed_B": 1.449132032, "train/loss_slope": -0.00030444274767522846} {"step": 700, "timestamp": 1778326478.4565573, "grad/layer_0/attn": 0.003089447505772114, "grad/layer_0/mlp": 0.0034071507398039103, "grad/layer_0/attn_mlp_ratio": 0.9067539569072671, "grad/layer_4/attn": 0.004065386485308409, "grad/layer_4/mlp": 0.0026626172475516796, "grad/layer_4/attn_mlp_ratio": 1.5268384280026563, "grad/layer_8/attn": 0.003802996827289462, "grad/layer_8/mlp": 0.003661808790639043, "grad/layer_8/attn_mlp_ratio": 1.0385568829141654, "grad/layer_12/attn": 0.005699626170098782, "grad/layer_12/mlp": 0.008235108107328415, "grad/layer_12/attn_mlp_ratio": 0.6921130878434226, "grad/layer_16/attn": 0.005489892326295376, "grad/layer_16/mlp": 0.005123901180922985, "grad/layer_16/attn_mlp_ratio": 1.0714281999801651, "grad/layer_20/attn": 0.004618971608579159, "grad/layer_20/mlp": 0.008424915373325348, "grad/layer_20/attn_mlp_ratio": 0.5482513887769644, "grad/layer_24/attn": 0.022762004286050797, "grad/layer_24/mlp": 0.014760219492018223, "grad/layer_24/attn_mlp_ratio": 1.5421182689151618, "grad/layer_27/attn": 0.004583713132888079, "grad/layer_27/mlp": 0.015612831339240074, "grad/layer_27/attn_mlp_ratio": 0.29358628194548564} {"step": 700, "timestamp": 1778326478.4725027, "train/loss": 2.4598336696624754, "train/z_loss": 0.001341442100238055, "train/perplexity": 11.702864836629148, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025554.560440426, "perf/iters_per_sec": 0.9658596803857927, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035347080230713, "data/tokens_consumed": 1470103552, "data/tokens_consumed_B": 1.470103552, "train/loss_slope": -0.0002998059810527134} {"step": 710, "timestamp": 1778326488.8279593, "train/loss": 2.44762761592865, "train/z_loss": 0.001342398370616138, "train/perplexity": 11.560887295082583, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026643.0754808274, "perf/iters_per_sec": 0.9663787248043191, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347909927368164, "data/tokens_consumed": 1491075072, "data/tokens_consumed_B": 1.491075072, "train/loss_slope": -0.0002965961662823996} {"step": 720, "timestamp": 1778326499.1864555, "train/loss": 2.437295746803284, "train/z_loss": 0.0013602304155938328, "train/perplexity": 11.442056648989618, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026172.5986429946, "perf/iters_per_sec": 0.9661543839659665, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035031270980835, "data/tokens_consumed": 1512046592, "data/tokens_consumed_B": 1.512046592, "train/loss_slope": -0.0002944611727742667} {"step": 730, "timestamp": 1778326509.5421515, "train/loss": 2.4327359437942504, "train/z_loss": 0.0013552054530009626, "train/perplexity": 11.390001894557626, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025965.1596659478, "perf/iters_per_sec": 0.9660554693536509, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351372480392456, "data/tokens_consumed": 1533018112, "data/tokens_consumed_B": 1.533018112, "train/loss_slope": -0.0002926692859193476} {"step": 740, "timestamp": 1778326519.896202, "train/loss": 2.4434028387069704, "train/z_loss": 0.001345349568873644, "train/perplexity": 11.512148150284636, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026701.0715752929, "perf/iters_per_sec": 0.9664063794971909, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034761381149292, "data/tokens_consumed": 1553989632, "data/tokens_consumed_B": 1.553989632, "train/loss_slope": -0.00028958607364345233} {"step": 750, "timestamp": 1778326530.2587044, "grad/layer_0/attn": 0.003290612483397126, "grad/layer_0/mlp": 0.0034236256033182144, "grad/layer_0/attn_mlp_ratio": 0.9611484340148018, "grad/layer_4/attn": 0.0025601147208362818, "grad/layer_4/mlp": 0.0026670668739825487, "grad/layer_4/attn_mlp_ratio": 0.9598989248528085, "grad/layer_8/attn": 0.004153227899223566, "grad/layer_8/mlp": 0.003437628736719489, "grad/layer_8/attn_mlp_ratio": 1.2081664706964028, "grad/layer_12/attn": 0.0057103075087070465, "grad/layer_12/mlp": 0.007137675303965807, "grad/layer_12/attn_mlp_ratio": 0.8000234229668539, "grad/layer_16/attn": 0.005016149487346411, "grad/layer_16/mlp": 0.00529941963031888, "grad/layer_16/attn_mlp_ratio": 0.9465469320439308, "grad/layer_20/attn": 0.0037048293743282557, "grad/layer_20/mlp": 0.006877701263874769, "grad/layer_20/attn_mlp_ratio": 0.5386726143400073, "grad/layer_24/attn": 0.011443471536040306, "grad/layer_24/mlp": 0.010944344103336334, "grad/layer_24/attn_mlp_ratio": 1.045605960798621, "grad/layer_27/attn": 0.006350046489387751, "grad/layer_27/mlp": 0.01087683904916048, "grad/layer_27/attn_mlp_ratio": 0.5838135879648338} {"step": 750, "timestamp": 1778326530.8585773, "eos/sharpness": 72.33078479766844, "eos/L0_probe": 2.4535646438598633, "eos/L_plus": 2.7487261295318604, "eos/L_minus": 2.881711006164551, "eos/grad_norm": 0.2079015076160431, "eos/embed_grad_frac": 0.06631363183259964, "eos/time_s": 0.5971050262451172} {"step": 750, "timestamp": 1778326530.8788877, "train/loss": 2.4031424522399902, "train/z_loss": 0.0013582292362116278, "train/perplexity": 11.057870669973848, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910800.331038529, "perf/iters_per_sec": 0.9111405997460027, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.097525453567505, "data/tokens_consumed": 1574961152, "data/tokens_consumed_B": 1.574961152, "train/loss_slope": -0.00029057089980953936} {"step": 750, "timestamp": 1778326532.2384982, "geo/rankme_last": 419.91986083984375, "geo/layer_0/stable_rank_q_proj": 19.719446182250977, "geo/layer_0/stable_rank_k_proj": 16.623926162719727, "geo/layer_0/stable_rank_o_proj": 46.46363830566406, "geo/layer_0/stable_rank_gate_proj": 129.14227294921875, "geo/layer_0/stable_rank_down_proj": 55.646846771240234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06487593054771423, "geo/layer_0/attn_entropy_mean": 6.235340118408203, "geo/layer_0/attn_entropy_std": 0.3967558443546295, "geo/layer_7/stable_rank_q_proj": 42.63737106323242, "geo/layer_7/stable_rank_k_proj": 40.01559829711914, "geo/layer_7/stable_rank_o_proj": 89.43448638916016, "geo/layer_7/stable_rank_gate_proj": 78.55290985107422, "geo/layer_7/stable_rank_down_proj": 141.59999084472656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4453963339328766, "geo/layer_7/attn_entropy_mean": 4.702439308166504, "geo/layer_7/attn_entropy_std": 0.7972783446311951, "geo/layer_14/stable_rank_q_proj": 50.50165557861328, "geo/layer_14/stable_rank_k_proj": 40.8250617980957, "geo/layer_14/stable_rank_o_proj": 43.3333625793457, "geo/layer_14/stable_rank_gate_proj": 70.92469024658203, "geo/layer_14/stable_rank_down_proj": 126.5467300415039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38034844398498535, "geo/layer_14/attn_entropy_mean": 5.529674053192139, "geo/layer_14/attn_entropy_std": 0.3998539447784424, "geo/layer_21/stable_rank_q_proj": 40.04221725463867, "geo/layer_21/stable_rank_k_proj": 30.185178756713867, "geo/layer_21/stable_rank_o_proj": 68.25662994384766, "geo/layer_21/stable_rank_gate_proj": 64.1625747680664, "geo/layer_21/stable_rank_down_proj": 49.801544189453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14439837634563446, "geo/layer_21/attn_entropy_mean": 5.892936706542969, "geo/layer_21/attn_entropy_std": 0.3114643096923828, "geo/layer_27/stable_rank_q_proj": 43.69160461425781, "geo/layer_27/stable_rank_k_proj": 31.893531799316406, "geo/layer_27/stable_rank_o_proj": 115.47459411621094, "geo/layer_27/stable_rank_gate_proj": 77.05776977539062, "geo/layer_27/stable_rank_down_proj": 127.3933334350586, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10581568628549576, "geo/layer_27/attn_entropy_mean": 4.375255584716797, "geo/layer_27/attn_entropy_std": 0.669589102268219, "attnres/final_alpha/block_0": 0.24758422374725342, "attnres/block_norm/0": 1.7862188816070557, "attnres/final_alpha/block_1": 0.004092397168278694, "attnres/block_norm/1": 48577.0625, "attnres/final_alpha/block_2": 0.00916825607419014, "attnres/block_norm/2": 29527.2890625, "attnres/final_alpha/block_3": 0.01106323953717947, "attnres/block_norm/3": 65279.9375, "attnres/final_alpha/block_4": 0.012477042153477669, "attnres/block_norm/4": 16193.978515625, "attnres/final_alpha/block_5": 0.6112411618232727, "attnres/block_norm/5": 6773.61669921875, "attnres/final_alpha/block_6": 0.10437369346618652, "attnres/block_norm/6": 43987.921875, "geo/tier1_time_s": 1.3561005592346191, "geo/step": 750.0, "geo/rankme_slope": 0.004635031960227273} {"step": 760, "timestamp": 1778326542.5856762, "train/loss": 2.467048001289368, "train/z_loss": 0.0013487614109180869, "train/perplexity": 11.787598465286178, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791914.8139323122, "perf/iters_per_sec": 0.8544515676175652, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1703413486480714, "data/tokens_consumed": 1595932672, "data/tokens_consumed_B": 1.595932672, "train/loss_slope": -0.0002848065026056265} {"step": 770, "timestamp": 1778326552.9319782, "train/loss": 2.393497347831726, "train/z_loss": 0.001361441402696073, "train/perplexity": 10.95172904918047, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028019.112288981, "perf/iters_per_sec": 0.967034870285502, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0340888738632201, "data/tokens_consumed": 1616904192, "data/tokens_consumed_B": 1.616904192, "train/loss_slope": -0.0002863586754385781} {"step": 780, "timestamp": 1778326563.2948604, "train/loss": 2.451009750366211, "train/z_loss": 0.0013565083499997854, "train/perplexity": 11.600053966270679, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024861.2911498751, "perf/iters_per_sec": 0.965529103827417, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0357015609741211, "data/tokens_consumed": 1637875712, "data/tokens_consumed_B": 1.637875712, "train/loss_slope": -0.000282064011173564} {"step": 790, "timestamp": 1778326573.6555786, "train/loss": 2.424272298812866, "train/z_loss": 0.001351524400524795, "train/perplexity": 11.294007765330397, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025125.990515346, "perf/iters_per_sec": 0.9656553223206262, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0355661869049073, "data/tokens_consumed": 1658847232, "data/tokens_consumed_B": 1.658847232, "train/loss_slope": -0.00028029801322438036} {"step": 800, "timestamp": 1778326583.9986567, "grad/layer_0/attn": 0.00384888076223433, "grad/layer_0/mlp": 0.0037389788776636124, "grad/layer_0/attn_mlp_ratio": 1.0293935283475149, "grad/layer_4/attn": 0.002377635333687067, "grad/layer_4/mlp": 0.0026156646199524403, "grad/layer_4/attn_mlp_ratio": 0.9089985102258438, "grad/layer_8/attn": 0.004403732251375914, "grad/layer_8/mlp": 0.0036326623521745205, "grad/layer_8/attn_mlp_ratio": 1.2122602386962291, "grad/layer_12/attn": 0.005612327251583338, "grad/layer_12/mlp": 0.007067425642162561, "grad/layer_12/attn_mlp_ratio": 0.7941119519801311, "grad/layer_16/attn": 0.0033274663146585226, "grad/layer_16/mlp": 0.00474679097533226, "grad/layer_16/attn_mlp_ratio": 0.7009927889917951, "grad/layer_20/attn": 0.0035185515880584717, "grad/layer_20/mlp": 0.0068166060373187065, "grad/layer_20/attn_mlp_ratio": 0.5161735205435362, "grad/layer_24/attn": 0.01374129019677639, "grad/layer_24/mlp": 0.013698210008442402, "grad/layer_24/attn_mlp_ratio": 1.003144942878883, "grad/layer_27/attn": 0.005968577694147825, "grad/layer_27/mlp": 0.013820121064782143, "grad/layer_27/attn_mlp_ratio": 0.43187593096922844} {"step": 800, "timestamp": 1778326584.0143719, "train/loss": 2.4461058378219604, "train/z_loss": 0.0013656510855071247, "train/perplexity": 11.543307569516372, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026033.1501695276, "perf/iters_per_sec": 0.9660878897521628, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351025104522704, "data/tokens_consumed": 1679818752, "data/tokens_consumed_B": 1.679818752, "train/loss_slope": -0.00027643411818558604} {"step": 810, "timestamp": 1778326594.379412, "train/loss": 2.4376547574996947, "train/z_loss": 0.0013542002416215837, "train/perplexity": 11.446165207179584, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024584.917804701, "perf/iters_per_sec": 0.965397318746901, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0358429431915284, "data/tokens_consumed": 1700790272, "data/tokens_consumed_B": 1.700790272, "train/loss_slope": -0.0002733474632985842} {"step": 820, "timestamp": 1778326604.7248032, "train/loss": 2.4829960107803344, "train/z_loss": 0.0013503830879926682, "train/perplexity": 11.977094221213441, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028131.1969827188, "perf/iters_per_sec": 0.9670883164323419, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0340317249298097, "data/tokens_consumed": 1721761792, "data/tokens_consumed_B": 1.721761792, "train/loss_slope": -0.0002663413875074271} {"step": 825, "timestamp": 1778326610.4784484, "eos/sharpness": 63.26167583465575, "eos/L0_probe": 2.4502553939819336, "eos/L_plus": 2.704132556915283, "eos/L_minus": 2.8289949893951416, "eos/grad_norm": 0.15028046071529388, "eos/embed_grad_frac": 0.10345155745744705, "eos/time_s": 0.5910854339599609} {"step": 825, "timestamp": 1778326611.855314, "geo/rankme_last": 420.2944641113281, "geo/layer_0/stable_rank_q_proj": 19.749164581298828, "geo/layer_0/stable_rank_k_proj": 16.682043075561523, "geo/layer_0/stable_rank_o_proj": 46.45442199707031, "geo/layer_0/stable_rank_gate_proj": 129.28103637695312, "geo/layer_0/stable_rank_down_proj": 55.613040924072266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06372464448213577, "geo/layer_0/attn_entropy_mean": 6.241339683532715, "geo/layer_0/attn_entropy_std": 0.39852508902549744, "geo/layer_7/stable_rank_q_proj": 42.608333587646484, "geo/layer_7/stable_rank_k_proj": 39.96060562133789, "geo/layer_7/stable_rank_o_proj": 89.39630126953125, "geo/layer_7/stable_rank_gate_proj": 78.70223999023438, "geo/layer_7/stable_rank_down_proj": 141.692626953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43803849816322327, "geo/layer_7/attn_entropy_mean": 4.746108531951904, "geo/layer_7/attn_entropy_std": 0.7927394509315491, "geo/layer_14/stable_rank_q_proj": 50.685543060302734, "geo/layer_14/stable_rank_k_proj": 40.78435516357422, "geo/layer_14/stable_rank_o_proj": 43.282470703125, "geo/layer_14/stable_rank_gate_proj": 71.08457946777344, "geo/layer_14/stable_rank_down_proj": 126.38877868652344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38643649220466614, "geo/layer_14/attn_entropy_mean": 5.546440601348877, "geo/layer_14/attn_entropy_std": 0.4084266126155853, "geo/layer_21/stable_rank_q_proj": 39.93983459472656, "geo/layer_21/stable_rank_k_proj": 30.104555130004883, "geo/layer_21/stable_rank_o_proj": 68.09542083740234, "geo/layer_21/stable_rank_gate_proj": 64.15005493164062, "geo/layer_21/stable_rank_down_proj": 49.842166900634766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14529451727867126, "geo/layer_21/attn_entropy_mean": 5.902417182922363, "geo/layer_21/attn_entropy_std": 0.3033856451511383, "geo/layer_27/stable_rank_q_proj": 43.69590377807617, "geo/layer_27/stable_rank_k_proj": 31.793804168701172, "geo/layer_27/stable_rank_o_proj": 115.3440933227539, "geo/layer_27/stable_rank_gate_proj": 76.95362854003906, "geo/layer_27/stable_rank_down_proj": 127.06639099121094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11185191571712494, "geo/layer_27/attn_entropy_mean": 4.348356246948242, "geo/layer_27/attn_entropy_std": 0.6629512906074524, "attnres/final_alpha/block_0": 0.2482614815235138, "attnres/block_norm/0": 1.7860541343688965, "attnres/final_alpha/block_1": 0.004167323932051659, "attnres/block_norm/1": 48615.4140625, "attnres/final_alpha/block_2": 0.00934021919965744, "attnres/block_norm/2": 29490.623046875, "attnres/final_alpha/block_3": 0.011059393174946308, "attnres/block_norm/3": 65393.35546875, "attnres/final_alpha/block_4": 0.012710223905742168, "attnres/block_norm/4": 16216.427734375, "attnres/final_alpha/block_5": 0.6092650890350342, "attnres/block_norm/5": 6817.2236328125, "attnres/final_alpha/block_6": 0.10519625246524811, "attnres/block_norm/6": 44187.84765625, "geo/tier1_time_s": 1.3559362888336182, "geo/step": 825.0, "geo/rankme_slope": 0.004068291937554633} {"step": 830, "timestamp": 1778326617.030782, "train/loss": 2.444815182685852, "train/z_loss": 0.0013507927069440484, "train/perplexity": 11.528418750546237, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704932.364503164, "perf/iters_per_sec": 0.8129751036182232, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.230049967765808, "data/tokens_consumed": 1742733312, "data/tokens_consumed_B": 1.742733312, "train/loss_slope": -0.00026280862744114977} {"step": 840, "timestamp": 1778326627.3815644, "train/loss": 2.4498385190963745, "train/z_loss": 0.0013480014051310718, "train/perplexity": 11.586475573604853, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026992.3163165948, "perf/iters_per_sec": 0.9665452558119749, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0346127033233643, "data/tokens_consumed": 1763704832, "data/tokens_consumed_B": 1.763704832, "train/loss_slope": -0.0002588914155913489} {"step": 850, "timestamp": 1778326637.7243881, "grad/layer_0/attn": 0.003255734220147133, "grad/layer_0/mlp": 0.0034151370637118816, "grad/layer_0/attn_mlp_ratio": 0.9533245852440388, "grad/layer_4/attn": 0.0034293667413294315, "grad/layer_4/mlp": 0.0027331518940627575, "grad/layer_4/attn_mlp_ratio": 1.2547296121031928, "grad/layer_8/attn": 0.0037639006040990353, "grad/layer_8/mlp": 0.0036789390724152327, "grad/layer_8/attn_mlp_ratio": 1.0230940028366995, "grad/layer_12/attn": 0.005661708302795887, "grad/layer_12/mlp": 0.007491514086723328, "grad/layer_12/attn_mlp_ratio": 0.7557495269554085, "grad/layer_16/attn": 0.0043822601437568665, "grad/layer_16/mlp": 0.004637370351701975, "grad/layer_16/attn_mlp_ratio": 0.9449881542563676, "grad/layer_20/attn": 0.003738561412319541, "grad/layer_20/mlp": 0.006949735339730978, "grad/layer_20/attn_mlp_ratio": 0.5379429828287478, "grad/layer_24/attn": 0.01693803258240223, "grad/layer_24/mlp": 0.01237773522734642, "grad/layer_24/attn_mlp_ratio": 1.3684274331654707, "grad/layer_27/attn": 0.007898280397057533, "grad/layer_27/mlp": 0.013532313518226147, "grad/layer_27/attn_mlp_ratio": 0.5836607560158558} {"step": 850, "timestamp": 1778326637.7402155, "train/loss": 2.436000370979309, "train/z_loss": 0.001352744339965284, "train/perplexity": 11.427244481163251, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025499.2418753493, "perf/iters_per_sec": 0.9658333024384257, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0353753566741943, "data/tokens_consumed": 1784676352, "data/tokens_consumed_B": 1.784676352, "train/loss_slope": -0.0002561433187402235} {"step": 860, "timestamp": 1778326648.093763, "train/loss": 2.4694120407104494, "train/z_loss": 0.0013470707926899195, "train/perplexity": 11.815497777281319, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026682.6264026812, "perf/iters_per_sec": 0.9663975841535002, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347707986831665, "data/tokens_consumed": 1805647872, "data/tokens_consumed_B": 1.805647872, "train/loss_slope": -0.000250761229755354} {"step": 870, "timestamp": 1778326658.4416668, "train/loss": 2.426256799697876, "train/z_loss": 0.0013604674721136689, "train/perplexity": 11.31644298773241, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027676.6690760632, "perf/iters_per_sec": 0.9668715806370083, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034263515472412, "data/tokens_consumed": 1826619392, "data/tokens_consumed_B": 1.826619392, "train/loss_slope": -0.00024885182050479423} {"step": 880, "timestamp": 1778326668.7977643, "train/loss": 2.4330313920974733, "train/z_loss": 0.0013692478998564184, "train/perplexity": 11.393367548454963, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026181.466497596, "perf/iters_per_sec": 0.966158612488554, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035026741027832, "data/tokens_consumed": 1847590912, "data/tokens_consumed_B": 1.847590912, "train/loss_slope": -0.0002463744089722755} {"step": 890, "timestamp": 1778326679.1547725, "train/loss": 2.446798014640808, "train/z_loss": 0.0013565536006353795, "train/perplexity": 11.551300345316763, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026045.33014458, "perf/iters_per_sec": 0.9660936976168537, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035096287727356, "data/tokens_consumed": 1868562432, "data/tokens_consumed_B": 1.868562432, "train/loss_slope": -0.00024286949045300828} {"step": 900, "timestamp": 1778326689.49852, "grad/layer_0/attn": 0.003200834384188056, "grad/layer_0/mlp": 0.003612803528085351, "grad/layer_0/attn_mlp_ratio": 0.885969655063809, "grad/layer_4/attn": 0.002006004797294736, "grad/layer_4/mlp": 0.0027413961943238974, "grad/layer_4/attn_mlp_ratio": 0.7317456441625019, "grad/layer_8/attn": 0.00387232075445354, "grad/layer_8/mlp": 0.0037951034028083086, "grad/layer_8/attn_mlp_ratio": 1.0203465469619186, "grad/layer_12/attn": 0.006047749891877174, "grad/layer_12/mlp": 0.006896542850881815, "grad/layer_12/attn_mlp_ratio": 0.8769248498777042, "grad/layer_16/attn": 0.006070166826248169, "grad/layer_16/mlp": 0.0050786943174898624, "grad/layer_16/attn_mlp_ratio": 1.1952219069026682, "grad/layer_20/attn": 0.0041335648857057095, "grad/layer_20/mlp": 0.006038035731762648, "grad/layer_20/attn_mlp_ratio": 0.6845876707059922, "grad/layer_24/attn": 0.008616045117378235, "grad/layer_24/mlp": 0.009275552816689014, "grad/layer_24/attn_mlp_ratio": 0.9288982764440746, "grad/layer_27/attn": 0.0072505902498960495, "grad/layer_27/mlp": 0.00910655315965414, "grad/layer_27/attn_mlp_ratio": 0.7961947888691557} {"step": 900, "timestamp": 1778326690.1885571, "eos/sharpness": 16.596698760986325, "eos/L0_probe": 2.4454307556152344, "eos/L_plus": 2.5415260791778564, "eos/L_minus": 2.5153024196624756, "eos/grad_norm": 0.11163558065891266, "eos/embed_grad_frac": 0.23351293802261353, "eos/time_s": 0.6871185302734375} {"step": 900, "timestamp": 1778326690.2097619, "train/loss": 2.459168028831482, "train/z_loss": 0.0013540227548219264, "train/perplexity": 11.695077524018444, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1897764.8515159262, "perf/iters_per_sec": 0.9049247987346297, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1050642013549805, "data/tokens_consumed": 1889533952, "data/tokens_consumed_B": 1.889533952, "train/loss_slope": -0.0002385300557540089} {"step": 900, "timestamp": 1778326691.5704713, "geo/rankme_last": 422.18658447265625, "geo/layer_0/stable_rank_q_proj": 19.792207717895508, "geo/layer_0/stable_rank_k_proj": 16.747459411621094, "geo/layer_0/stable_rank_o_proj": 46.45131301879883, "geo/layer_0/stable_rank_gate_proj": 129.4764404296875, "geo/layer_0/stable_rank_down_proj": 55.60369110107422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06735267490148544, "geo/layer_0/attn_entropy_mean": 6.241482734680176, "geo/layer_0/attn_entropy_std": 0.3990233242511749, "geo/layer_7/stable_rank_q_proj": 42.545166015625, "geo/layer_7/stable_rank_k_proj": 39.892940521240234, "geo/layer_7/stable_rank_o_proj": 89.65718841552734, "geo/layer_7/stable_rank_gate_proj": 78.67375183105469, "geo/layer_7/stable_rank_down_proj": 141.75856018066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4320041537284851, "geo/layer_7/attn_entropy_mean": 4.733340740203857, "geo/layer_7/attn_entropy_std": 0.7857165932655334, "geo/layer_14/stable_rank_q_proj": 50.663631439208984, "geo/layer_14/stable_rank_k_proj": 40.85548400878906, "geo/layer_14/stable_rank_o_proj": 43.236610412597656, "geo/layer_14/stable_rank_gate_proj": 71.04544830322266, "geo/layer_14/stable_rank_down_proj": 126.30199432373047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3731350302696228, "geo/layer_14/attn_entropy_mean": 5.523374080657959, "geo/layer_14/attn_entropy_std": 0.4147581458091736, "geo/layer_21/stable_rank_q_proj": 39.919090270996094, "geo/layer_21/stable_rank_k_proj": 30.146677017211914, "geo/layer_21/stable_rank_o_proj": 67.97954559326172, "geo/layer_21/stable_rank_gate_proj": 64.04421997070312, "geo/layer_21/stable_rank_down_proj": 49.85552978515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14587295055389404, "geo/layer_21/attn_entropy_mean": 5.885900497436523, "geo/layer_21/attn_entropy_std": 0.3100665211677551, "geo/layer_27/stable_rank_q_proj": 43.771644592285156, "geo/layer_27/stable_rank_k_proj": 31.754474639892578, "geo/layer_27/stable_rank_o_proj": 115.29903411865234, "geo/layer_27/stable_rank_gate_proj": 76.76957702636719, "geo/layer_27/stable_rank_down_proj": 127.07861328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10439504683017731, "geo/layer_27/attn_entropy_mean": 4.3390889167785645, "geo/layer_27/attn_entropy_std": 0.6685945987701416, "attnres/final_alpha/block_0": 0.2465832531452179, "attnres/block_norm/0": 1.7856264114379883, "attnres/final_alpha/block_1": 0.004116006661206484, "attnres/block_norm/1": 48544.3203125, "attnres/final_alpha/block_2": 0.009299865923821926, "attnres/block_norm/2": 29350.1328125, "attnres/final_alpha/block_3": 0.01102966908365488, "attnres/block_norm/3": 65596.359375, "attnres/final_alpha/block_4": 0.01270242314785719, "attnres/block_norm/4": 16236.19921875, "attnres/final_alpha/block_5": 0.6130052804946899, "attnres/block_norm/5": 6762.150390625, "attnres/final_alpha/block_6": 0.1032634899020195, "attnres/block_norm/6": 43955.74609375, "geo/tier1_time_s": 1.3576123714447021, "geo/step": 900.0, "geo/rankme_slope": 0.004423338502317994} {"step": 910, "timestamp": 1778326701.9269438, "train/loss": 2.4188300132751466, "train/z_loss": 0.001338751777075231, "train/perplexity": 11.232709502826665, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790307.7278088555, "perf/iters_per_sec": 0.8536852492374685, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1713919162750244, "data/tokens_consumed": 1910505472, "data/tokens_consumed_B": 1.910505472, "train/loss_slope": -0.00023712902221734362} {"step": 920, "timestamp": 1778326712.2834315, "train/loss": 2.4333882570266723, "train/z_loss": 0.0013511330238543452, "train/perplexity": 11.39743416733215, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026063.997057217, "perf/iters_per_sec": 0.9661025986944279, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.035086750984192, "data/tokens_consumed": 1931476992, "data/tokens_consumed_B": 1.931476992, "train/loss_slope": -0.0002346545100863086} {"step": 930, "timestamp": 1778326722.6438732, "train/loss": 2.4144256353378295, "train/z_loss": 0.0013595210737548768, "train/perplexity": 11.183345194251535, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025223.160656565, "perf/iters_per_sec": 0.9657016566546274, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0355165004730225, "data/tokens_consumed": 1952448512, "data/tokens_consumed_B": 1.952448512, "train/loss_slope": -0.00023345104204507264} {"step": 940, "timestamp": 1778326733.0022292, "train/loss": 2.400724411010742, "train/z_loss": 0.0013530044234357774, "train/perplexity": 11.031164584006998, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026321.3088257017, "perf/iters_per_sec": 0.9662252945068844, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349553108215332, "data/tokens_consumed": 1973420032, "data/tokens_consumed_B": 1.973420032, "train/loss_slope": -0.00023306980912656021} {"step": 950, "timestamp": 1778326743.3429813, "grad/layer_0/attn": 0.0035141396801918745, "grad/layer_0/mlp": 0.0033573477994650602, "grad/layer_0/attn_mlp_ratio": 1.046701082348897, "grad/layer_4/attn": 0.0019523133523762226, "grad/layer_4/mlp": 0.002872284036129713, "grad/layer_4/attn_mlp_ratio": 0.6797075984992509, "grad/layer_8/attn": 0.003298460738733411, "grad/layer_8/mlp": 0.0037484210915863514, "grad/layer_8/attn_mlp_ratio": 0.8799600071990535, "grad/layer_12/attn": 0.006687368731945753, "grad/layer_12/mlp": 0.008548612706363201, "grad/layer_12/attn_mlp_ratio": 0.7822753098570541, "grad/layer_16/attn": 0.004996446892619133, "grad/layer_16/mlp": 0.006014689337462187, "grad/layer_16/attn_mlp_ratio": 0.8307073780898842, "grad/layer_20/attn": 0.003324960358440876, "grad/layer_20/mlp": 0.0072023384273052216, "grad/layer_20/attn_mlp_ratio": 0.46165010792472716, "grad/layer_24/attn": 0.019940132275223732, "grad/layer_24/mlp": 0.014132985845208168, "grad/layer_24/attn_mlp_ratio": 1.410893094532829, "grad/layer_27/attn": 0.012434935197234154, "grad/layer_27/mlp": 0.014903905801475048, "grad/layer_27/attn_mlp_ratio": 0.8343406942742078} {"step": 950, "timestamp": 1778326743.3587463, "train/loss": 2.422700309753418, "train/z_loss": 0.0013583158375695347, "train/perplexity": 11.276267655968093, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026184.3602353684, "perf/iters_per_sec": 0.96615999233025, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0350252628326415, "data/tokens_consumed": 1994391552, "data/tokens_consumed_B": 1.994391552, "train/loss_slope": -0.0002311457652827028} {"step": 960, "timestamp": 1778326753.713036, "train/loss": 2.420563244819641, "train/z_loss": 0.0013577411998994648, "train/perplexity": 11.25219527106311, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026424.1491286804, "perf/iters_per_sec": 0.9662743325847055, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0349027872085572, "data/tokens_consumed": 2015363072, "data/tokens_consumed_B": 2.015363072, "train/loss_slope": -0.00022932729829794828} {"step": 970, "timestamp": 1778326764.0648754, "train/loss": 2.4178810834884645, "train/z_loss": 0.001346157561056316, "train/perplexity": 11.222055505941247, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027290.4664903677, "perf/iters_per_sec": 0.9666874248935545, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0344605445861816, "data/tokens_consumed": 2036334592, "data/tokens_consumed_B": 2.036334592, "train/loss_slope": -0.00022764196683102978} {"step": 975, "timestamp": 1778326769.8187664, "eos/sharpness": 5.805039405822753, "eos/L0_probe": 2.441063404083252, "eos/L_plus": 2.4842212200164795, "eos/L_minus": 2.455955982208252, "eos/grad_norm": 0.09755609929561615, "eos/embed_grad_frac": 0.25605881214141846, "eos/time_s": 0.5920805931091309} {"step": 975, "timestamp": 1778326771.1955304, "geo/rankme_last": 422.19482421875, "geo/layer_0/stable_rank_q_proj": 19.83022117614746, "geo/layer_0/stable_rank_k_proj": 16.75116729736328, "geo/layer_0/stable_rank_o_proj": 46.39567184448242, "geo/layer_0/stable_rank_gate_proj": 129.4997100830078, "geo/layer_0/stable_rank_down_proj": 55.54357147216797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06664492934942245, "geo/layer_0/attn_entropy_mean": 6.239434719085693, "geo/layer_0/attn_entropy_std": 0.40007781982421875, "geo/layer_7/stable_rank_q_proj": 42.61000442504883, "geo/layer_7/stable_rank_k_proj": 39.90315246582031, "geo/layer_7/stable_rank_o_proj": 89.59952545166016, "geo/layer_7/stable_rank_gate_proj": 78.75799560546875, "geo/layer_7/stable_rank_down_proj": 142.03231811523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42782437801361084, "geo/layer_7/attn_entropy_mean": 4.717782974243164, "geo/layer_7/attn_entropy_std": 0.789856493473053, "geo/layer_14/stable_rank_q_proj": 50.628089904785156, "geo/layer_14/stable_rank_k_proj": 40.780155181884766, "geo/layer_14/stable_rank_o_proj": 43.18956756591797, "geo/layer_14/stable_rank_gate_proj": 71.06329345703125, "geo/layer_14/stable_rank_down_proj": 126.25294494628906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38255226612091064, "geo/layer_14/attn_entropy_mean": 5.57127046585083, "geo/layer_14/attn_entropy_std": 0.38333994150161743, "geo/layer_21/stable_rank_q_proj": 39.92737579345703, "geo/layer_21/stable_rank_k_proj": 30.081270217895508, "geo/layer_21/stable_rank_o_proj": 67.92942810058594, "geo/layer_21/stable_rank_gate_proj": 63.91047668457031, "geo/layer_21/stable_rank_down_proj": 49.868377685546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14137175679206848, "geo/layer_21/attn_entropy_mean": 5.8916192054748535, "geo/layer_21/attn_entropy_std": 0.30577898025512695, "geo/layer_27/stable_rank_q_proj": 43.82218551635742, "geo/layer_27/stable_rank_k_proj": 31.774803161621094, "geo/layer_27/stable_rank_o_proj": 115.36280059814453, "geo/layer_27/stable_rank_gate_proj": 76.7292709350586, "geo/layer_27/stable_rank_down_proj": 127.24089813232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1126311644911766, "geo/layer_27/attn_entropy_mean": 4.341850757598877, "geo/layer_27/attn_entropy_std": 0.6826940178871155, "attnres/final_alpha/block_0": 0.24600377678871155, "attnres/block_norm/0": 1.7852187156677246, "attnres/final_alpha/block_1": 0.004058695863932371, "attnres/block_norm/1": 48670.3046875, "attnres/final_alpha/block_2": 0.00918104313313961, "attnres/block_norm/2": 29365.720703125, "attnres/final_alpha/block_3": 0.01090063527226448, "attnres/block_norm/3": 65751.6953125, "attnres/final_alpha/block_4": 0.012485096231102943, "attnres/block_norm/4": 16271.76953125, "attnres/final_alpha/block_5": 0.6131176352500916, "attnres/block_norm/5": 6856.6259765625, "attnres/final_alpha/block_6": 0.10425311326980591, "attnres/block_norm/6": 44015.390625, "geo/tier1_time_s": 1.3571670055389404, "geo/step": 975.0, "geo/rankme_slope": 0.004523268479567307} {"step": 980, "timestamp": 1778326776.369843, "train/loss": 2.446461582183838, "train/z_loss": 0.00134565964108333, "train/perplexity": 11.547414766614434, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705074.6412389148, "perf/iters_per_sec": 0.813042946452577, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.2299473285675049, "data/tokens_consumed": 2057306112, "data/tokens_consumed_B": 2.057306112, "train/loss_slope": -0.00022418661757465046} {"step": 990, "timestamp": 1778326786.7174091, "train/loss": 2.449383544921875, "train/z_loss": 0.0013471683487296104, "train/perplexity": 11.58120522547245, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027748.701107216, "perf/iters_per_sec": 0.9669059281860428, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0342267751693726, "data/tokens_consumed": 2078277632, "data/tokens_consumed_B": 2.078277632, "train/loss_slope": -0.00022062772179451068} {"step": 1000, "timestamp": 1778326797.0594847, "grad/layer_0/attn": 0.003104390576481819, "grad/layer_0/mlp": 0.0033187558874487877, "grad/layer_0/attn_mlp_ratio": 0.9354079023050572, "grad/layer_4/attn": 0.0033728082198649645, "grad/layer_4/mlp": 0.002679293043911457, "grad/layer_4/attn_mlp_ratio": 1.2588425523834454, "grad/layer_8/attn": 0.0032761350739747286, "grad/layer_8/mlp": 0.003441845765337348, "grad/layer_8/attn_mlp_ratio": 0.9518540928774631, "grad/layer_12/attn": 0.005757401697337627, "grad/layer_12/mlp": 0.007074680645018816, "grad/layer_12/attn_mlp_ratio": 0.8138037467473475, "grad/layer_16/attn": 0.003900691866874695, "grad/layer_16/mlp": 0.004504936281591654, "grad/layer_16/attn_mlp_ratio": 0.865870577621016, "grad/layer_20/attn": 0.002925594337284565, "grad/layer_20/mlp": 0.0064538344740867615, "grad/layer_20/attn_mlp_ratio": 0.4533110205568828, "grad/layer_24/attn": 0.014432846568524837, "grad/layer_24/mlp": 0.011744001880288124, "grad/layer_24/attn_mlp_ratio": 1.2289547117541226, "grad/layer_27/attn": 0.006286668125540018, "grad/layer_27/mlp": 0.011876935139298439, "grad/layer_27/attn_mlp_ratio": 0.5293173700853965} {"step": 1000, "timestamp": 1778326797.0750163, "train/loss": 2.4353216171264647, "train/z_loss": 0.001355135755147785, "train/perplexity": 11.419490826653384, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025885.835402478, "perf/iters_per_sec": 0.9660176445972815, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0351777791976928, "data/tokens_consumed": 2099249152, "data/tokens_consumed_B": 2.099249152, "train/loss_slope": -0.00021105916307668517} {"step": 1000, "timestamp": 1778326803.9744194, "geo/ww_alpha_mean": 7.364359723409593, "geo/ww_alpha_std": 3.9848242331874033, "geo/ww_alpha_min": 1.3353056143865325, "geo/ww_alpha_max": 24.994947473211244, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.8938969082932933, "geo/ww_alpha_by_type/k_proj": 4.518385617589415, "geo/ww_alpha_by_type/v_proj": 7.914375655533568, "geo/ww_alpha_by_type/o_proj": 8.256059934406801, "geo/ww_alpha_by_type/gate_proj": 7.915447300129062, "geo/ww_alpha_by_type/up_proj": 11.082418182390347, "geo/ww_alpha_by_type/down_proj": 8.048824156948319, "geo/twonn_id/layer_0": 0.6929675936698914, "geo/twonn_id/layer_7": 3.6906940937042236, "geo/twonn_id/layer_14": 4.838038444519043, "geo/twonn_id/layer_21": 9.546703338623047, "geo/twonn_id/layer_27": 7.191970348358154, "geo/tier2_time_s": 6.892847776412964} {"step": 1000, "timestamp": 1778326804.7228699, "eoc/jacobian_sigma/layer_0/attn": 1434.6158447265625, "eoc/jacobian_sigma/layer_0/mlp": 9752.0263671875, "eoc/jacobian_sigma/layer_0": 9752.0263671875, "eoc/jacobian_sigma/layer_7/attn": 1.0451983213424683, "eoc/jacobian_sigma/layer_7/mlp": 1.654287338256836, "eoc/jacobian_sigma/layer_7": 1.654287338256836, "eoc/jacobian_sigma/layer_14/attn": 1.7088621854782104, "eoc/jacobian_sigma/layer_14/mlp": 15.254952430725098, "eoc/jacobian_sigma/layer_14": 15.254952430725098, "eoc/jacobian_sigma/layer_21/attn": 1.0149824619293213, "eoc/jacobian_sigma/layer_21/mlp": 4.992458343505859, "eoc/jacobian_sigma/layer_21": 4.992458343505859, "eoc/jacobian_sigma/layer_27/attn": 3.3281917572021484, "eoc/jacobian_sigma/layer_27/mlp": 25.803237915039062, "eoc/jacobian_sigma/layer_27": 25.803237915039062, "eoc/layer0_sigma": 9752.0263671875, "eoc/sigma_max": 25.803237915039062, "eoc/sigma_min": 1.654287338256836, "eoc/sigma_mean": 11.926234006881714, "eoc/time_s": 0.7424421310424805} {"step": 1010, "timestamp": 1778326815.1021938, "train/loss": 2.436655879020691, "train/z_loss": 0.0013526193448342382, "train/perplexity": 11.43473758743889, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163677.9345829412, "perf/iters_per_sec": 0.5548848793902117, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.8021756172180177, "data/tokens_consumed": 2120220672, "data/tokens_consumed_B": 2.120220672, "train/loss_slope": -0.00020654317243228688} {"step": 1020, "timestamp": 1778326825.4749498, "train/loss": 2.435658311843872, "train/z_loss": 0.0013333512702956795, "train/perplexity": 11.423336356238622, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023224.3788249313, "perf/iters_per_sec": 0.9647485632061631, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0365395069122314, "data/tokens_consumed": 2141192192, "data/tokens_consumed_B": 2.141192192, "train/loss_slope": -0.000200122453026896} {"step": 1030, "timestamp": 1778326835.858715, "train/loss": 2.367160439491272, "train/z_loss": 0.0013691838714294136, "train/perplexity": 10.667059477735648, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020635.3067064995, "perf/iters_per_sec": 0.9635139974148271, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0378676414489747, "data/tokens_consumed": 2162163712, "data/tokens_consumed_B": 2.162163712, "train/loss_slope": -0.00019711745655862125} {"step": 1040, "timestamp": 1778326846.2102406, "train/loss": 2.4584744930267335, "train/z_loss": 0.0013427071389742196, "train/perplexity": 11.686969380984975, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027024.2200810667, "perf/iters_per_sec": 0.9665604687123617, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0345964193344117, "data/tokens_consumed": 2183135232, "data/tokens_consumed_B": 2.183135232, "train/loss_slope": -0.00019067770557077375} {"step": 1050, "timestamp": 1778326856.5504813, "grad/layer_0/attn": 0.0030073560774326324, "grad/layer_0/mlp": 0.003380072768777609, "grad/layer_0/attn_mlp_ratio": 0.8897311372225628, "grad/layer_4/attn": 0.0035155992954969406, "grad/layer_4/mlp": 0.0028087173122912645, "grad/layer_4/attn_mlp_ratio": 1.2516742624631019, "grad/layer_8/attn": 0.007563476450741291, "grad/layer_8/mlp": 0.003856018418446183, "grad/layer_8/attn_mlp_ratio": 1.9614730620611907, "grad/layer_12/attn": 0.01151714101433754, "grad/layer_12/mlp": 0.00858188048005104, "grad/layer_12/attn_mlp_ratio": 1.3420299789663401, "grad/layer_16/attn": 0.004543123301118612, "grad/layer_16/mlp": 0.005423606839030981, "grad/layer_16/attn_mlp_ratio": 0.8376571813167416, "grad/layer_20/attn": 0.004511169623583555, "grad/layer_20/mlp": 0.007516752928495407, "grad/layer_20/attn_mlp_ratio": 0.6001487086887215, "grad/layer_24/attn": 0.014826908707618713, "grad/layer_24/mlp": 0.012152901850640774, "grad/layer_24/attn_mlp_ratio": 1.2200303078094816, "grad/layer_27/attn": 0.00722963223233819, "grad/layer_27/mlp": 0.011560323648154736, "grad/layer_27/attn_mlp_ratio": 0.6253831977233499} {"step": 1050, "timestamp": 1778326857.1681569, "eos/sharpness": 60.71517467498778, "eos/L0_probe": 2.437633991241455, "eos/L_plus": 2.7062299251556396, "eos/L_minus": 2.7761898040771484, "eos/grad_norm": 0.19469404220581055, "eos/embed_grad_frac": 0.08527541905641556, "eos/time_s": 0.6148900985717773} {"step": 1050, "timestamp": 1778326857.1876724, "train/loss": 2.4019100427627564, "train/z_loss": 0.0013563977321609856, "train/perplexity": 11.04425123944358, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911560.4529724484, "perf/iters_per_sec": 0.9115030541288607, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0970890283584596, "data/tokens_consumed": 2204106752, "data/tokens_consumed_B": 2.204106752, "train/loss_slope": -0.0001874853673512035} {"step": 1050, "timestamp": 1778326858.5510273, "geo/rankme_last": 422.2370910644531, "geo/layer_0/stable_rank_q_proj": 19.863006591796875, "geo/layer_0/stable_rank_k_proj": 16.7849178314209, "geo/layer_0/stable_rank_o_proj": 46.35810470581055, "geo/layer_0/stable_rank_gate_proj": 129.56939697265625, "geo/layer_0/stable_rank_down_proj": 55.5125617980957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06328139454126358, "geo/layer_0/attn_entropy_mean": 6.241225719451904, "geo/layer_0/attn_entropy_std": 0.4010600447654724, "geo/layer_7/stable_rank_q_proj": 42.54872131347656, "geo/layer_7/stable_rank_k_proj": 39.877925872802734, "geo/layer_7/stable_rank_o_proj": 89.5283203125, "geo/layer_7/stable_rank_gate_proj": 78.78724670410156, "geo/layer_7/stable_rank_down_proj": 142.47901916503906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43028298020362854, "geo/layer_7/attn_entropy_mean": 4.717693328857422, "geo/layer_7/attn_entropy_std": 0.7781875729560852, "geo/layer_14/stable_rank_q_proj": 50.643646240234375, "geo/layer_14/stable_rank_k_proj": 40.9714469909668, "geo/layer_14/stable_rank_o_proj": 43.178890228271484, "geo/layer_14/stable_rank_gate_proj": 71.26720428466797, "geo/layer_14/stable_rank_down_proj": 126.14662170410156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3793782889842987, "geo/layer_14/attn_entropy_mean": 5.51941442489624, "geo/layer_14/attn_entropy_std": 0.42364558577537537, "geo/layer_21/stable_rank_q_proj": 39.896366119384766, "geo/layer_21/stable_rank_k_proj": 30.038455963134766, "geo/layer_21/stable_rank_o_proj": 67.95653533935547, "geo/layer_21/stable_rank_gate_proj": 63.82417678833008, "geo/layer_21/stable_rank_down_proj": 49.82476043701172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13930685818195343, "geo/layer_21/attn_entropy_mean": 5.886259078979492, "geo/layer_21/attn_entropy_std": 0.3014319837093353, "geo/layer_27/stable_rank_q_proj": 43.82888412475586, "geo/layer_27/stable_rank_k_proj": 31.700834274291992, "geo/layer_27/stable_rank_o_proj": 115.2905502319336, "geo/layer_27/stable_rank_gate_proj": 76.62760162353516, "geo/layer_27/stable_rank_down_proj": 127.32978820800781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.101035937666893, "geo/layer_27/attn_entropy_mean": 4.334057331085205, "geo/layer_27/attn_entropy_std": 0.6721636056900024, "attnres/final_alpha/block_0": 0.24647146463394165, "attnres/block_norm/0": 1.7848801612854004, "attnres/final_alpha/block_1": 0.0040992246940732, "attnres/block_norm/1": 48805.5859375, "attnres/final_alpha/block_2": 0.00931820459663868, "attnres/block_norm/2": 29385.84375, "attnres/final_alpha/block_3": 0.011016305536031723, "attnres/block_norm/3": 66126.8984375, "attnres/final_alpha/block_4": 0.012528068386018276, "attnres/block_norm/4": 16260.775390625, "attnres/final_alpha/block_5": 0.6117919683456421, "attnres/block_norm/5": 6846.1923828125, "attnres/final_alpha/block_6": 0.10477478057146072, "attnres/block_norm/6": 44296.90234375, "geo/tier1_time_s": 1.3595504760742188, "geo/step": 1050.0, "geo/rankme_slope": 0.004489230201357887} {"step": 1060, "timestamp": 1778326868.9052336, "train/loss": 2.436243009567261, "train/z_loss": 0.00135285968426615, "train/perplexity": 11.4300175080364, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790312.6106381547, "perf/iters_per_sec": 0.8536875775519155, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.1713887214660645, "data/tokens_consumed": 2225078272, "data/tokens_consumed_B": 2.225078272, "train/loss_slope": -0.00018062072949047528} {"step": 1070, "timestamp": 1778326879.2570655, "train/loss": 2.4587584733963013, "train/z_loss": 0.0013483189861290157, "train/perplexity": 11.690288722160473, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026892.3608045932, "perf/iters_per_sec": 0.9664975933096853, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.034663724899292, "data/tokens_consumed": 2246049792, "data/tokens_consumed_B": 2.246049792, "train/loss_slope": -0.00017462118585439477} {"step": 1080, "timestamp": 1778326890.1153324, "train/loss": 2.437948441505432, "train/z_loss": 0.0013562376261688769, "train/perplexity": 11.449527256493853, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932473.5972123463, "perf/iters_per_sec": 0.9214752183973056, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0852163791656495, "data/tokens_consumed": 2267021312, "data/tokens_consumed_B": 2.267021312, "train/loss_slope": -0.00016512471926380412} {"step": 1090, "timestamp": 1778326900.4754822, "train/loss": 2.4723567962646484, "train/z_loss": 0.0013530609081499278, "train/perplexity": 11.850342809858068, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025655.1770770294, "perf/iters_per_sec": 0.9659076581368586, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0352956533432007, "data/tokens_consumed": 2287992832, "data/tokens_consumed_B": 2.287992832, "train/loss_slope": -0.00015582021064598067} {"step": 1100, "timestamp": 1778326910.8299463, "grad/layer_0/attn": 0.002911708317697048, "grad/layer_0/mlp": 0.0031583551317453384, "grad/layer_0/attn_mlp_ratio": 0.9219065317386763, "grad/layer_4/attn": 0.002020153682678938, "grad/layer_4/mlp": 0.002868885640054941, "grad/layer_4/attn_mlp_ratio": 0.704159686276056, "grad/layer_8/attn": 0.0033791421446949244, "grad/layer_8/mlp": 0.0035736411809921265, "grad/layer_8/attn_mlp_ratio": 0.9455739619609486, "grad/layer_12/attn": 0.007630873937159777, "grad/layer_12/mlp": 0.008241120725870132, "grad/layer_12/attn_mlp_ratio": 0.9259509839007944, "grad/layer_16/attn": 0.005879086907953024, "grad/layer_16/mlp": 0.004565585404634476, "grad/layer_16/attn_mlp_ratio": 1.2876961568204621, "grad/layer_20/attn": 0.002973027992993593, "grad/layer_20/mlp": 0.006774681154638529, "grad/layer_20/attn_mlp_ratio": 0.4388439664165754, "grad/layer_24/attn": 0.017075035721063614, "grad/layer_24/mlp": 0.010868732817471027, "grad/layer_24/attn_mlp_ratio": 1.5710235821156502, "grad/layer_27/attn": 0.010829903185367584, "grad/layer_27/mlp": 0.011732813902199268, "grad/layer_27/attn_mlp_ratio": 0.9230439674009631} {"step": 1100, "timestamp": 1778326910.855475, "train/loss": 2.441812586784363, "train/z_loss": 0.001358827925287187, "train/perplexity": 11.493855483384687, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021486.5528114787, "perf/iters_per_sec": 0.9639199031884569, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0374305963516235, "data/tokens_consumed": 2308964352, "data/tokens_consumed_B": 2.308964352, "train/loss_slope": -0.00014852424231585602} {"step": 1110, "timestamp": 1778326921.2087767, "train/loss": 2.3985150337219237, "train/z_loss": 0.0013660176075063646, "train/perplexity": 11.006819483165394, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026681.5990864823, "perf/iters_per_sec": 0.9663970942909633, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0347713232040405, "data/tokens_consumed": 2329935872, "data/tokens_consumed_B": 2.329935872, "train/loss_slope": -0.0001446802569909243} {"step": 1120, "timestamp": 1778326931.566391, "train/loss": 2.4349449634552003, "train/z_loss": 0.0013577035861089826, "train/perplexity": 11.41519044343796, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026097.1783435445, "perf/iters_per_sec": 0.9661184207647059, "perf/gpu_mem_gb": 77.837706752, "perf/step_time_s": 1.0350697994232179, "data/tokens_consumed": 2350907392, "data/tokens_consumed_B": 2.350907392, "train/loss_slope": -0.0001388404523435742} {"step": 1125, "timestamp": 1778326937.3446596, "eos/sharpness": 36.70775890350341, "eos/L0_probe": 2.4321844577789307, "eos/L_plus": 2.597276449203491, "eos/L_minus": 2.6341700553894043, "eos/grad_norm": 0.1327546089887619, "eos/embed_grad_frac": 0.13421531021595, "eos/time_s": 0.610384464263916} {"step": 1125, "timestamp": 1778326938.7248101, "geo/rankme_last": 421.7069091796875, "geo/layer_0/stable_rank_q_proj": 19.896574020385742, "geo/layer_0/stable_rank_k_proj": 16.80613899230957, "geo/layer_0/stable_rank_o_proj": 46.35763931274414, "geo/layer_0/stable_rank_gate_proj": 129.3678436279297, "geo/layer_0/stable_rank_down_proj": 55.53997039794922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.063839852809906, "geo/layer_0/attn_entropy_mean": 6.242884635925293, "geo/layer_0/attn_entropy_std": 0.4051550030708313, "geo/layer_7/stable_rank_q_proj": 42.52896499633789, "geo/layer_7/stable_rank_k_proj": 39.80421829223633, "geo/layer_7/stable_rank_o_proj": 89.44357299804688, "geo/layer_7/stable_rank_gate_proj": 78.9452896118164, "geo/layer_7/stable_rank_down_proj": 143.03053283691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42098623514175415, "geo/layer_7/attn_entropy_mean": 4.687028884887695, "geo/layer_7/attn_entropy_std": 0.7819364070892334, "geo/layer_14/stable_rank_q_proj": 50.76184844970703, "geo/layer_14/stable_rank_k_proj": 41.0202751159668, "geo/layer_14/stable_rank_o_proj": 43.17433547973633, "geo/layer_14/stable_rank_gate_proj": 71.44580841064453, "geo/layer_14/stable_rank_down_proj": 126.12200927734375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3837765157222748, "geo/layer_14/attn_entropy_mean": 5.530616283416748, "geo/layer_14/attn_entropy_std": 0.4209023416042328, "geo/layer_21/stable_rank_q_proj": 40.00579833984375, "geo/layer_21/stable_rank_k_proj": 30.02114486694336, "geo/layer_21/stable_rank_o_proj": 67.86795806884766, "geo/layer_21/stable_rank_gate_proj": 63.66233825683594, "geo/layer_21/stable_rank_down_proj": 49.878814697265625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14147822558879852, "geo/layer_21/attn_entropy_mean": 5.891541481018066, "geo/layer_21/attn_entropy_std": 0.304638147354126, "geo/layer_27/stable_rank_q_proj": 43.813140869140625, "geo/layer_27/stable_rank_k_proj": 31.621549606323242, "geo/layer_27/stable_rank_o_proj": 115.14759063720703, "geo/layer_27/stable_rank_gate_proj": 76.57649993896484, "geo/layer_27/stable_rank_down_proj": 127.27035522460938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1081448569893837, "geo/layer_27/attn_entropy_mean": 4.328271389007568, "geo/layer_27/attn_entropy_std": 0.6593008637428284, "attnres/final_alpha/block_0": 0.24661140143871307, "attnres/block_norm/0": 1.7846415042877197, "attnres/final_alpha/block_1": 0.004166669677942991, "attnres/block_norm/1": 48754.9375, "attnres/final_alpha/block_2": 0.009095286019146442, "attnres/block_norm/2": 29493.15234375, "attnres/final_alpha/block_3": 0.010913804173469543, "attnres/block_norm/3": 66346.71875, "attnres/final_alpha/block_4": 0.012401105836033821, "attnres/block_norm/4": 16253.083984375, "attnres/final_alpha/block_5": 0.6126165390014648, "attnres/block_norm/5": 6810.27001953125, "attnres/final_alpha/block_6": 0.10419519245624542, "attnres/block_norm/6": 44149.27734375, "geo/tier1_time_s": 1.358555555343628, "geo/step": 1125.0, "geo/rankme_slope": 0.004211491603477328} {"step": 1130, "timestamp": 1778326943.909464, "train/loss": 2.4079333782196044, "train/z_loss": 0.0013604415580630302, "train/perplexity": 11.110975218250266, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699804.2911735503, "perf/iters_per_sec": 0.810529847704673, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2337608575820922, "data/tokens_consumed": 2371878912, "data/tokens_consumed_B": 2.371878912, "train/loss_slope": -0.00013624235522879374} {"step": 1140, "timestamp": 1778326954.2650208, "train/loss": 2.41910662651062, "train/z_loss": 0.0013636256800964475, "train/perplexity": 11.235817048719722, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026309.3589272818, "perf/iters_per_sec": 0.9662195963512811, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349614143371582, "data/tokens_consumed": 2392850432, "data/tokens_consumed_B": 2.392850432, "train/loss_slope": -0.00013028088043732505} {"step": 1150, "timestamp": 1778326964.6172981, "grad/layer_0/attn": 0.002832630183547735, "grad/layer_0/mlp": 0.0030223610810935497, "grad/layer_0/attn_mlp_ratio": 0.9372242474748944, "grad/layer_4/attn": 0.001896439935080707, "grad/layer_4/mlp": 0.002506947610527277, "grad/layer_4/attn_mlp_ratio": 0.7564736700000161, "grad/layer_8/attn": 0.005082848947495222, "grad/layer_8/mlp": 0.0034629455767571926, "grad/layer_8/attn_mlp_ratio": 1.46778188916176, "grad/layer_12/attn": 0.005259952507913113, "grad/layer_12/mlp": 0.007095899898558855, "grad/layer_12/attn_mlp_ratio": 0.7412664368130026, "grad/layer_16/attn": 0.004526739474385977, "grad/layer_16/mlp": 0.004553992301225662, "grad/layer_16/attn_mlp_ratio": 0.994015596769035, "grad/layer_20/attn": 0.0034940990153700113, "grad/layer_20/mlp": 0.006256439257413149, "grad/layer_20/attn_mlp_ratio": 0.5584804416316947, "grad/layer_24/attn": 0.01274305209517479, "grad/layer_24/mlp": 0.008581310510635376, "grad/layer_24/attn_mlp_ratio": 1.4849773738968843, "grad/layer_27/attn": 0.005315606482326984, "grad/layer_27/mlp": 0.007184537127614021, "grad/layer_27/attn_mlp_ratio": 0.7398676231916879} {"step": 1150, "timestamp": 1778326964.6329648, "train/loss": 2.4021993398666384, "train/z_loss": 0.001373884326312691, "train/perplexity": 11.0474467715485, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023790.8460067217, "perf/iters_per_sec": 0.9650186758073434, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362493753433228, "data/tokens_consumed": 2413821952, "data/tokens_consumed_B": 2.413821952, "train/loss_slope": -0.00012631681546984656} {"step": 1160, "timestamp": 1778326974.9927511, "train/loss": 2.448247790336609, "train/z_loss": 0.0013406955520622432, "train/perplexity": 11.56805928522879, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025883.6424081412, "perf/iters_per_sec": 0.9660165988960939, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351788997650146, "data/tokens_consumed": 2434793472, "data/tokens_consumed_B": 2.434793472, "train/loss_slope": -0.00012240617745208054} {"step": 1170, "timestamp": 1778326985.346299, "train/loss": 2.451882481575012, "train/z_loss": 0.0013516736682504416, "train/perplexity": 11.610182114323418, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026612.1642648096, "perf/iters_per_sec": 0.9663639851879166, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034806776046753, "data/tokens_consumed": 2455764992, "data/tokens_consumed_B": 2.455764992, "train/loss_slope": -0.00011866244181047286} {"step": 1180, "timestamp": 1778326995.7015758, "train/loss": 2.471549940109253, "train/z_loss": 0.0013541330234147608, "train/perplexity": 11.840785144167603, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026221.979578589, "perf/iters_per_sec": 0.9661779306309648, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035006046295166, "data/tokens_consumed": 2476736512, "data/tokens_consumed_B": 2.476736512, "train/loss_slope": -0.00011125162205036574} {"step": 1190, "timestamp": 1778327006.0799131, "train/loss": 2.4491551160812377, "train/z_loss": 0.0013528714887797832, "train/perplexity": 11.57856004631872, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021647.2141584598, "perf/iters_per_sec": 0.9639965124885844, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373481512069702, "data/tokens_consumed": 2497708032, "data/tokens_consumed_B": 2.497708032, "train/loss_slope": -0.00010353181693348625} {"step": 1200, "timestamp": 1778327016.4221892, "grad/layer_0/attn": 0.003559573320671916, "grad/layer_0/mlp": 0.003611975349485874, "grad/layer_0/attn_mlp_ratio": 0.9854921137901375, "grad/layer_4/attn": 0.0016738682752475142, "grad/layer_4/mlp": 0.002450209576636553, "grad/layer_4/attn_mlp_ratio": 0.6831530751055002, "grad/layer_8/attn": 0.003119739005342126, "grad/layer_8/mlp": 0.003442235989496112, "grad/layer_8/attn_mlp_ratio": 0.9063117474312476, "grad/layer_12/attn": 0.004574882797896862, "grad/layer_12/mlp": 0.0066261147148907185, "grad/layer_12/attn_mlp_ratio": 0.6904321651076482, "grad/layer_16/attn": 0.0033522439189255238, "grad/layer_16/mlp": 0.004576472099870443, "grad/layer_16/attn_mlp_ratio": 0.7324951999097522, "grad/layer_20/attn": 0.002672144677489996, "grad/layer_20/mlp": 0.006245610769838095, "grad/layer_20/attn_mlp_ratio": 0.4278436062026494, "grad/layer_24/attn": 0.0102361049503088, "grad/layer_24/mlp": 0.010358351282775402, "grad/layer_24/attn_mlp_ratio": 0.9881982732628783, "grad/layer_27/attn": 0.006502535659819841, "grad/layer_27/mlp": 0.010610106401145458, "grad/layer_27/attn_mlp_ratio": 0.6128624306568302} {"step": 1200, "timestamp": 1778327017.0246794, "eos/sharpness": 32.967209815979, "eos/L0_probe": 2.425478458404541, "eos/L_plus": 2.6018524169921875, "eos/L_minus": 2.5787765979766846, "eos/grad_norm": 0.12548738718032837, "eos/embed_grad_frac": 0.177938774228096, "eos/time_s": 0.5996081829071045} {"step": 1200, "timestamp": 1778327017.043984, "train/loss": 2.4209067106246946, "train/z_loss": 0.0013493740814737975, "train/perplexity": 11.256060679150256, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913639.8901996545, "perf/iters_per_sec": 0.9124946070669434, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0958968877792359, "data/tokens_consumed": 2518679552, "data/tokens_consumed_B": 2.518679552, "train/loss_slope": -0.00010063103060088572} {"step": 1200, "timestamp": 1778327018.4126892, "geo/rankme_last": 422.103271484375, "geo/layer_0/stable_rank_q_proj": 19.920839309692383, "geo/layer_0/stable_rank_k_proj": 16.816349029541016, "geo/layer_0/stable_rank_o_proj": 46.454078674316406, "geo/layer_0/stable_rank_gate_proj": 129.35133361816406, "geo/layer_0/stable_rank_down_proj": 55.49038314819336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06830379366874695, "geo/layer_0/attn_entropy_mean": 6.251188278198242, "geo/layer_0/attn_entropy_std": 0.4040692448616028, "geo/layer_7/stable_rank_q_proj": 42.55228805541992, "geo/layer_7/stable_rank_k_proj": 39.91362762451172, "geo/layer_7/stable_rank_o_proj": 89.32657623291016, "geo/layer_7/stable_rank_gate_proj": 79.00096893310547, "geo/layer_7/stable_rank_down_proj": 142.69827270507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4227725565433502, "geo/layer_7/attn_entropy_mean": 4.713142395019531, "geo/layer_7/attn_entropy_std": 0.7918304204940796, "geo/layer_14/stable_rank_q_proj": 50.76615524291992, "geo/layer_14/stable_rank_k_proj": 41.09086227416992, "geo/layer_14/stable_rank_o_proj": 43.173118591308594, "geo/layer_14/stable_rank_gate_proj": 71.56066131591797, "geo/layer_14/stable_rank_down_proj": 126.11479949951172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3874257206916809, "geo/layer_14/attn_entropy_mean": 5.529428958892822, "geo/layer_14/attn_entropy_std": 0.39545342326164246, "geo/layer_21/stable_rank_q_proj": 39.968719482421875, "geo/layer_21/stable_rank_k_proj": 30.06111717224121, "geo/layer_21/stable_rank_o_proj": 67.81681823730469, "geo/layer_21/stable_rank_gate_proj": 63.70135498046875, "geo/layer_21/stable_rank_down_proj": 49.85200500488281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14596867561340332, "geo/layer_21/attn_entropy_mean": 5.886087894439697, "geo/layer_21/attn_entropy_std": 0.2968515455722809, "geo/layer_27/stable_rank_q_proj": 43.87982940673828, "geo/layer_27/stable_rank_k_proj": 31.619325637817383, "geo/layer_27/stable_rank_o_proj": 114.99102783203125, "geo/layer_27/stable_rank_gate_proj": 76.4348373413086, "geo/layer_27/stable_rank_down_proj": 127.38436889648438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10582386702299118, "geo/layer_27/attn_entropy_mean": 4.318918704986572, "geo/layer_27/attn_entropy_std": 0.6714047789573669, "attnres/final_alpha/block_0": 0.24533379077911377, "attnres/block_norm/0": 1.784197211265564, "attnres/final_alpha/block_1": 0.004069122020155191, "attnres/block_norm/1": 48908.76953125, "attnres/final_alpha/block_2": 0.009237149730324745, "attnres/block_norm/2": 29424.83984375, "attnres/final_alpha/block_3": 0.010772092267870903, "attnres/block_norm/3": 65794.484375, "attnres/final_alpha/block_4": 0.012511841021478176, "attnres/block_norm/4": 16320.4560546875, "attnres/final_alpha/block_5": 0.6146581172943115, "attnres/block_norm/5": 6905.93017578125, "attnres/final_alpha/block_6": 0.10341787338256836, "attnres/block_norm/6": 44562.390625, "geo/tier1_time_s": 1.3648879528045654, "geo/step": 1200.0, "geo/rankme_slope": 0.004041932548572815} {"step": 1210, "timestamp": 1778327029.3725805, "train/loss": 2.400365471839905, "train/z_loss": 0.001352225849404931, "train/perplexity": 11.027205777465715, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701583.816578478, "perf/iters_per_sec": 0.811378391541709, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2324705839157104, "data/tokens_consumed": 2539651072, "data/tokens_consumed_B": 2.539651072, "train/loss_slope": -9.634590256224873e-05} {"step": 1220, "timestamp": 1778327039.7370741, "train/loss": 2.3906418323516845, "train/z_loss": 0.0013606840162537991, "train/perplexity": 10.920500824905723, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024877.2793027386, "perf/iters_per_sec": 0.9655367275727933, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035693383216858, "data/tokens_consumed": 2560622592, "data/tokens_consumed_B": 2.560622592, "train/loss_slope": -9.466166358934021e-05} {"step": 1230, "timestamp": 1778327050.09115, "train/loss": 2.3993499994277956, "train/z_loss": 0.0013516610953956843, "train/perplexity": 11.016013637832325, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026665.1621689391, "perf/iters_per_sec": 0.9663892565579124, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034779715538025, "data/tokens_consumed": 2581594112, "data/tokens_consumed_B": 2.581594112, "train/loss_slope": -9.410838943467711e-05} {"step": 1240, "timestamp": 1778327060.4494815, "train/loss": 2.395455074310303, "train/z_loss": 0.001352912571746856, "train/perplexity": 10.973190540136198, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025733.4569229549, "perf/iters_per_sec": 0.9659449848761343, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352556467056275, "data/tokens_consumed": 2602565632, "data/tokens_consumed_B": 2.602565632, "train/loss_slope": -9.367570722564504e-05} {"step": 1250, "timestamp": 1778327070.7981493, "grad/layer_0/attn": 0.0032179541885852814, "grad/layer_0/mlp": 0.0031828030478209257, "grad/layer_0/attn_mlp_ratio": 1.0110440511498242, "grad/layer_4/attn": 0.0020644748583436012, "grad/layer_4/mlp": 0.002707857871428132, "grad/layer_4/attn_mlp_ratio": 0.7624014553668749, "grad/layer_8/attn": 0.004158709663897753, "grad/layer_8/mlp": 0.0035140295512974262, "grad/layer_8/attn_mlp_ratio": 1.1834588994894515, "grad/layer_12/attn": 0.007485814858227968, "grad/layer_12/mlp": 0.0062625049613416195, "grad/layer_12/attn_mlp_ratio": 1.1953387318499473, "grad/layer_16/attn": 0.0034098741598427296, "grad/layer_16/mlp": 0.0044915927574038506, "grad/layer_16/attn_mlp_ratio": 0.7591681321297772, "grad/layer_20/attn": 0.0044804830104112625, "grad/layer_20/mlp": 0.006872876081615686, "grad/layer_20/attn_mlp_ratio": 0.6519080064902295, "grad/layer_24/attn": 0.008284788578748703, "grad/layer_24/mlp": 0.009247817099094391, "grad/layer_24/attn_mlp_ratio": 0.895864223998719, "grad/layer_27/attn": 0.007735822815448046, "grad/layer_27/mlp": 0.007442059926688671, "grad/layer_27/attn_mlp_ratio": 1.0394733162196872} {"step": 1250, "timestamp": 1778327070.8139443, "train/loss": 2.4668740034103394, "train/z_loss": 0.001344244321808219, "train/perplexity": 11.785547626580346, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024851.409385727, "perf/iters_per_sec": 0.9655243918350825, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357066154479981, "data/tokens_consumed": 2623537152, "data/tokens_consumed_B": 2.623537152, "train/loss_slope": -8.917875362880372e-05} {"step": 1260, "timestamp": 1778327081.1675212, "train/loss": 2.4235493183135985, "train/z_loss": 0.0013619718607515096, "train/perplexity": 11.285845368939734, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026670.0651988257, "perf/iters_per_sec": 0.9663915945047501, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347772121429444, "data/tokens_consumed": 2644508672, "data/tokens_consumed_B": 2.644508672, "train/loss_slope": -8.864145218843174e-05} {"step": 1270, "timestamp": 1778327092.0264704, "train/loss": 2.4201969861984254, "train/z_loss": 0.0013609030866064131, "train/perplexity": 11.24807481216034, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932316.7356582414, "perf/iters_per_sec": 0.9214004209796149, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0853044748306275, "data/tokens_consumed": 2665480192, "data/tokens_consumed_B": 2.665480192, "train/loss_slope": -8.840855497731626e-05} {"step": 1275, "timestamp": 1778327097.8041344, "eos/sharpness": 56.67181015014647, "eos/L0_probe": 2.4300003051757812, "eos/L_plus": 2.6852056980133057, "eos/L_minus": 2.7415130138397217, "eos/grad_norm": 0.19591762125492096, "eos/embed_grad_frac": 0.06656795740127563, "eos/time_s": 0.6108639240264893} {"step": 1275, "timestamp": 1778327099.1931736, "geo/rankme_last": 422.3238830566406, "geo/layer_0/stable_rank_q_proj": 19.94708824157715, "geo/layer_0/stable_rank_k_proj": 16.841257095336914, "geo/layer_0/stable_rank_o_proj": 46.38294219970703, "geo/layer_0/stable_rank_gate_proj": 129.28536987304688, "geo/layer_0/stable_rank_down_proj": 55.59142303466797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0659889206290245, "geo/layer_0/attn_entropy_mean": 6.249514102935791, "geo/layer_0/attn_entropy_std": 0.4025896191596985, "geo/layer_7/stable_rank_q_proj": 42.63040542602539, "geo/layer_7/stable_rank_k_proj": 40.05493927001953, "geo/layer_7/stable_rank_o_proj": 89.49495697021484, "geo/layer_7/stable_rank_gate_proj": 79.11591339111328, "geo/layer_7/stable_rank_down_proj": 142.4727325439453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44093599915504456, "geo/layer_7/attn_entropy_mean": 4.719910144805908, "geo/layer_7/attn_entropy_std": 0.8019804358482361, "geo/layer_14/stable_rank_q_proj": 50.82465744018555, "geo/layer_14/stable_rank_k_proj": 41.02003860473633, "geo/layer_14/stable_rank_o_proj": 43.17233657836914, "geo/layer_14/stable_rank_gate_proj": 71.53721618652344, "geo/layer_14/stable_rank_down_proj": 126.0473403930664, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3816313147544861, "geo/layer_14/attn_entropy_mean": 5.548184394836426, "geo/layer_14/attn_entropy_std": 0.40405935049057007, "geo/layer_21/stable_rank_q_proj": 39.934810638427734, "geo/layer_21/stable_rank_k_proj": 30.00714683532715, "geo/layer_21/stable_rank_o_proj": 67.736328125, "geo/layer_21/stable_rank_gate_proj": 63.658138275146484, "geo/layer_21/stable_rank_down_proj": 49.879791259765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14044496417045593, "geo/layer_21/attn_entropy_mean": 5.907336235046387, "geo/layer_21/attn_entropy_std": 0.30925893783569336, "geo/layer_27/stable_rank_q_proj": 43.99400329589844, "geo/layer_27/stable_rank_k_proj": 31.550355911254883, "geo/layer_27/stable_rank_o_proj": 114.91878509521484, "geo/layer_27/stable_rank_gate_proj": 76.26667785644531, "geo/layer_27/stable_rank_down_proj": 127.4024658203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10270561277866364, "geo/layer_27/attn_entropy_mean": 4.341928958892822, "geo/layer_27/attn_entropy_std": 0.6431404948234558, "attnres/final_alpha/block_0": 0.2472861111164093, "attnres/block_norm/0": 1.7839715480804443, "attnres/final_alpha/block_1": 0.0041128844022750854, "attnres/block_norm/1": 48816.265625, "attnres/final_alpha/block_2": 0.00906863808631897, "attnres/block_norm/2": 29432.767578125, "attnres/final_alpha/block_3": 0.010673641227185726, "attnres/block_norm/3": 66475.6328125, "attnres/final_alpha/block_4": 0.012661678716540337, "attnres/block_norm/4": 16353.5517578125, "attnres/final_alpha/block_5": 0.6110416650772095, "attnres/block_norm/5": 6893.861328125, "attnres/final_alpha/block_6": 0.10515546053647995, "attnres/block_norm/6": 44394.54296875, "geo/tier1_time_s": 1.3600795269012451, "geo/step": 1275.0, "geo/rankme_slope": 0.0039036376869141295} {"step": 1280, "timestamp": 1778327104.3722868, "train/loss": 2.4149945974349976, "train/z_loss": 0.0013514542952179909, "train/perplexity": 11.189709904254288, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699454.7953588064, "perf/iters_per_sec": 0.8103631951135666, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340145826339721, "data/tokens_consumed": 2686451712, "data/tokens_consumed_B": 2.686451712, "train/loss_slope": -8.657232944649902e-05} {"step": 1290, "timestamp": 1778327114.741608, "train/loss": 2.365211820602417, "train/z_loss": 0.0013619721517898141, "train/perplexity": 10.64629368302954, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023494.0035318753, "perf/iters_per_sec": 0.9648771302852036, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036401391029358, "data/tokens_consumed": 2707423232, "data/tokens_consumed_B": 2.707423232, "train/loss_slope": -8.696389949396855e-05} {"step": 1300, "timestamp": 1778327125.092981, "grad/layer_0/attn": 0.003927829209715128, "grad/layer_0/mlp": 0.0036463807336986065, "grad/layer_0/attn_mlp_ratio": 1.0771856777590183, "grad/layer_4/attn": 0.0031559502240270376, "grad/layer_4/mlp": 0.0028217127546668053, "grad/layer_4/attn_mlp_ratio": 1.1184519426941122, "grad/layer_8/attn": 0.003853019792586565, "grad/layer_8/mlp": 0.0038928932044655085, "grad/layer_8/attn_mlp_ratio": 0.9897573581497328, "grad/layer_12/attn": 0.012195456773042679, "grad/layer_12/mlp": 0.008569588884711266, "grad/layer_12/attn_mlp_ratio": 1.4231087155755324, "grad/layer_16/attn": 0.004274423234164715, "grad/layer_16/mlp": 0.0058152079582214355, "grad/layer_16/attn_mlp_ratio": 0.7350421844531622, "grad/layer_20/attn": 0.005264083854854107, "grad/layer_20/mlp": 0.007388757076114416, "grad/layer_20/attn_mlp_ratio": 0.7124451013048952, "grad/layer_24/attn": 0.018796022981405258, "grad/layer_24/mlp": 0.013229340314865112, "grad/layer_24/attn_mlp_ratio": 1.4207830770070107, "grad/layer_27/attn": 0.014993410557508469, "grad/layer_27/mlp": 0.011315138079226017, "grad/layer_27/attn_mlp_ratio": 1.3250753388973686} {"step": 1300, "timestamp": 1778327125.1088073, "train/loss": 2.3734529495239256, "train/z_loss": 0.0013444466283544898, "train/perplexity": 10.734393684926847, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024332.7987736668, "perf/iters_per_sec": 0.9652770990246138, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359719514846801, "data/tokens_consumed": 2728394752, "data/tokens_consumed_B": 2.728394752, "train/loss_slope": -8.965608051436247e-05} {"step": 1310, "timestamp": 1778327135.4777958, "train/loss": 2.3889774322509765, "train/z_loss": 0.0013591173919849098, "train/perplexity": 10.902339859981394, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024101.003460702, "perf/iters_per_sec": 0.9651665704062948, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360905885696412, "data/tokens_consumed": 2749366272, "data/tokens_consumed_B": 2.749366272, "train/loss_slope": -9.114437732759481e-05} {"step": 1320, "timestamp": 1778327145.8311439, "train/loss": 2.4345861673355103, "train/z_loss": 0.0013610185822471976, "train/perplexity": 11.411095452078765, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026675.855474195, "perf/iters_per_sec": 0.9663943555232024, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347742557525634, "data/tokens_consumed": 2770337792, "data/tokens_consumed_B": 2.770337792, "train/loss_slope": -9.107647737582594e-05} {"step": 1330, "timestamp": 1778327156.1867528, "train/loss": 2.429220962524414, "train/z_loss": 0.0013592426665127277, "train/perplexity": 11.350036531134764, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025854.7606491502, "perf/iters_per_sec": 0.9660028270002128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351936578750611, "data/tokens_consumed": 2791309312, "data/tokens_consumed_B": 2.791309312, "train/loss_slope": -8.702185815638907e-05} {"step": 1340, "timestamp": 1778327166.5413792, "train/loss": 2.4518579483032226, "train/z_loss": 0.0013463705312460662, "train/perplexity": 11.60989728206403, "train/grad_norm": 0.33984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026514.6740384698, "perf/iters_per_sec": 0.9663174982254361, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348565578460693, "data/tokens_consumed": 2812280832, "data/tokens_consumed_B": 2.812280832, "train/loss_slope": -8.387768456716277e-05} {"step": 1350, "timestamp": 1778327176.8980906, "grad/layer_0/attn": 0.0037052887491881847, "grad/layer_0/mlp": 0.003543608821928501, "grad/layer_0/attn_mlp_ratio": 1.0456257535246565, "grad/layer_4/attn": 0.0019719828851521015, "grad/layer_4/mlp": 0.002695308532565832, "grad/layer_4/attn_mlp_ratio": 0.7316352796580657, "grad/layer_8/attn": 0.004155673552304506, "grad/layer_8/mlp": 0.0036189237143844366, "grad/layer_8/attn_mlp_ratio": 1.1483175014037614, "grad/layer_12/attn": 0.005390087142586708, "grad/layer_12/mlp": 0.007472550496459007, "grad/layer_12/attn_mlp_ratio": 0.7213182531197435, "grad/layer_16/attn": 0.004219521302729845, "grad/layer_16/mlp": 0.005167581606656313, "grad/layer_16/attn_mlp_ratio": 0.8165369300875727, "grad/layer_20/attn": 0.003367727855220437, "grad/layer_20/mlp": 0.00660285260528326, "grad/layer_20/attn_mlp_ratio": 0.5100413420589794, "grad/layer_24/attn": 0.007274119183421135, "grad/layer_24/mlp": 0.008486050181090832, "grad/layer_24/attn_mlp_ratio": 0.8571854917746363, "grad/layer_27/attn": 0.006786012556403875, "grad/layer_27/mlp": 0.008708078414201736, "grad/layer_27/attn_mlp_ratio": 0.7792778332598607} {"step": 1350, "timestamp": 1778327177.4988503, "eos/sharpness": 51.12090110778808, "eos/L0_probe": 2.4267995357513428, "eos/L_plus": 2.733368158340454, "eos/L_minus": 2.6314399242401123, "eos/grad_norm": 0.13931559026241302, "eos/embed_grad_frac": 0.1214330717921257, "eos/time_s": 0.5980935096740723} {"step": 1350, "timestamp": 1778327177.5182943, "train/loss": 2.392079734802246, "train/z_loss": 0.0013614705647341907, "train/perplexity": 10.936214734630406, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911544.9164315346, "perf/iters_per_sec": 0.911495645728843, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0970979452133178, "data/tokens_consumed": 2833252352, "data/tokens_consumed_B": 2.833252352, "train/loss_slope": -8.359493554049772e-05} {"step": 1350, "timestamp": 1778327178.8766263, "geo/rankme_last": 423.5278625488281, "geo/layer_0/stable_rank_q_proj": 19.967132568359375, "geo/layer_0/stable_rank_k_proj": 16.90518569946289, "geo/layer_0/stable_rank_o_proj": 46.30337905883789, "geo/layer_0/stable_rank_gate_proj": 129.50473022460938, "geo/layer_0/stable_rank_down_proj": 55.58127212524414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06781930476427078, "geo/layer_0/attn_entropy_mean": 6.249731063842773, "geo/layer_0/attn_entropy_std": 0.4040185213088989, "geo/layer_7/stable_rank_q_proj": 42.663002014160156, "geo/layer_7/stable_rank_k_proj": 39.97853088378906, "geo/layer_7/stable_rank_o_proj": 89.44127655029297, "geo/layer_7/stable_rank_gate_proj": 79.20828247070312, "geo/layer_7/stable_rank_down_proj": 142.5414276123047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41929835081100464, "geo/layer_7/attn_entropy_mean": 4.731900215148926, "geo/layer_7/attn_entropy_std": 0.7947020530700684, "geo/layer_14/stable_rank_q_proj": 50.697364807128906, "geo/layer_14/stable_rank_k_proj": 41.059932708740234, "geo/layer_14/stable_rank_o_proj": 43.18082046508789, "geo/layer_14/stable_rank_gate_proj": 71.37999725341797, "geo/layer_14/stable_rank_down_proj": 126.21045684814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38560518622398376, "geo/layer_14/attn_entropy_mean": 5.495720863342285, "geo/layer_14/attn_entropy_std": 0.4131968021392822, "geo/layer_21/stable_rank_q_proj": 39.99427032470703, "geo/layer_21/stable_rank_k_proj": 29.90149688720703, "geo/layer_21/stable_rank_o_proj": 67.77740478515625, "geo/layer_21/stable_rank_gate_proj": 63.68506622314453, "geo/layer_21/stable_rank_down_proj": 49.88465118408203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13928648829460144, "geo/layer_21/attn_entropy_mean": 5.883113861083984, "geo/layer_21/attn_entropy_std": 0.28926515579223633, "geo/layer_27/stable_rank_q_proj": 43.89963150024414, "geo/layer_27/stable_rank_k_proj": 31.570772171020508, "geo/layer_27/stable_rank_o_proj": 115.1201171875, "geo/layer_27/stable_rank_gate_proj": 76.2326889038086, "geo/layer_27/stable_rank_down_proj": 127.52619171142578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1073431745171547, "geo/layer_27/attn_entropy_mean": 4.332976341247559, "geo/layer_27/attn_entropy_std": 0.6564101576805115, "attnres/final_alpha/block_0": 0.2453850954771042, "attnres/block_norm/0": 1.7839288711547852, "attnres/final_alpha/block_1": 0.004087504930794239, "attnres/block_norm/1": 48881.55859375, "attnres/final_alpha/block_2": 0.008929397910833359, "attnres/block_norm/2": 29441.302734375, "attnres/final_alpha/block_3": 0.010643413290381432, "attnres/block_norm/3": 66511.359375, "attnres/final_alpha/block_4": 0.012262203730642796, "attnres/block_norm/4": 16291.529296875, "attnres/final_alpha/block_5": 0.6168533563613892, "attnres/block_norm/5": 6773.5, "attnres/final_alpha/block_6": 0.10183900594711304, "attnres/block_norm/6": 44146.22265625, "geo/tier1_time_s": 1.3545076847076416, "geo/step": 1350.0, "geo/rankme_slope": 0.003996480662920321} {"step": 1360, "timestamp": 1778327189.2395034, "train/loss": 2.434141683578491, "train/z_loss": 0.0013510852935723961, "train/perplexity": 11.406024532554689, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789764.0784246954, "perf/iters_per_sec": 0.8534260170100667, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1717477321624756, "data/tokens_consumed": 2854223872, "data/tokens_consumed_B": 2.854223872, "train/loss_slope": -8.218521614028927e-05} {"step": 1370, "timestamp": 1778327199.5890815, "train/loss": 2.4389471054077148, "train/z_loss": 0.0013489133212715386, "train/perplexity": 11.460967197440365, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027425.6954284708, "perf/iters_per_sec": 0.9667519070761065, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343915462493896, "data/tokens_consumed": 2875195392, "data/tokens_consumed_B": 2.875195392, "train/loss_slope": -7.842897311581747e-05} {"step": 1380, "timestamp": 1778327209.9422007, "train/loss": 2.396791672706604, "train/z_loss": 0.0013569097500294447, "train/perplexity": 10.987867095159439, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026522.5644215287, "perf/iters_per_sec": 0.9663212606532711, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348525285720824, "data/tokens_consumed": 2896166912, "data/tokens_consumed_B": 2.896166912, "train/loss_slope": -7.92362161488137e-05} {"step": 1390, "timestamp": 1778327220.3067482, "train/loss": 2.406249737739563, "train/z_loss": 0.0013632988557219505, "train/perplexity": 11.092284069602554, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024255.1395671878, "perf/iters_per_sec": 0.965240068229288, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360116958618164, "data/tokens_consumed": 2917138432, "data/tokens_consumed_B": 2.917138432, "train/loss_slope": -7.719571244920514e-05} {"step": 1400, "timestamp": 1778327230.6554987, "grad/layer_0/attn": 0.003496132791042328, "grad/layer_0/mlp": 0.003501511411741376, "grad/layer_0/attn_mlp_ratio": 0.998463886044352, "grad/layer_4/attn": 0.002317497506737709, "grad/layer_4/mlp": 0.002656641649082303, "grad/layer_4/attn_mlp_ratio": 0.8723409949942456, "grad/layer_8/attn": 0.004840333946049213, "grad/layer_8/mlp": 0.003935177344828844, "grad/layer_8/attn_mlp_ratio": 1.230016692744014, "grad/layer_12/attn": 0.005756048951297998, "grad/layer_12/mlp": 0.00815924908965826, "grad/layer_12/attn_mlp_ratio": 0.705463066208802, "grad/layer_16/attn": 0.0036917023826390505, "grad/layer_16/mlp": 0.004547938238829374, "grad/layer_16/attn_mlp_ratio": 0.8117309663413959, "grad/layer_20/attn": 0.0030062624718993902, "grad/layer_20/mlp": 0.006121684331446886, "grad/layer_20/attn_mlp_ratio": 0.4910841951369335, "grad/layer_24/attn": 0.008551254868507385, "grad/layer_24/mlp": 0.010136645287275314, "grad/layer_24/attn_mlp_ratio": 0.8435981078357447, "grad/layer_27/attn": 0.00810709223151207, "grad/layer_27/mlp": 0.009302386082708836, "grad/layer_27/attn_mlp_ratio": 0.8715067373338505} {"step": 1400, "timestamp": 1778327230.6710715, "train/loss": 2.4280035495758057, "train/z_loss": 0.0013453055638819934, "train/perplexity": 11.336227257194592, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024458.6873614055, "perf/iters_per_sec": 0.9653371273810413, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359075307846068, "data/tokens_consumed": 2938109952, "data/tokens_consumed_B": 2.938109952, "train/loss_slope": -7.724055303956454e-05} {"step": 1410, "timestamp": 1778327241.0363545, "train/loss": 2.425543284416199, "train/z_loss": 0.001346967916470021, "train/perplexity": 11.308371412664954, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024215.3572548137, "perf/iters_per_sec": 0.9652210985445088, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360320568084718, "data/tokens_consumed": 2959081472, "data/tokens_consumed_B": 2.959081472, "train/loss_slope": -7.437892306362444e-05} {"step": 1420, "timestamp": 1778327251.4032948, "train/loss": 2.392092728614807, "train/z_loss": 0.001356916769873351, "train/perplexity": 10.93635683867803, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024340.8585287, "perf/iters_per_sec": 0.9652809422152996, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359678268432617, "data/tokens_consumed": 2980052992, "data/tokens_consumed_B": 2.980052992, "train/loss_slope": -7.331201721875834e-05} {"step": 1425, "timestamp": 1778327257.1745918, "eos/sharpness": 68.35515499114989, "eos/L0_probe": 2.419874429702759, "eos/L_plus": 2.8428704738616943, "eos/L_minus": 2.6804299354553223, "eos/grad_norm": 0.22032903134822845, "eos/embed_grad_frac": 0.04931393638253212, "eos/time_s": 0.5952651500701904} {"step": 1425, "timestamp": 1778327258.5525463, "geo/rankme_last": 422.3092041015625, "geo/layer_0/stable_rank_q_proj": 20.024085998535156, "geo/layer_0/stable_rank_k_proj": 16.9184627532959, "geo/layer_0/stable_rank_o_proj": 46.240562438964844, "geo/layer_0/stable_rank_gate_proj": 129.604248046875, "geo/layer_0/stable_rank_down_proj": 55.59719467163086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06263794004917145, "geo/layer_0/attn_entropy_mean": 6.247864723205566, "geo/layer_0/attn_entropy_std": 0.406648188829422, "geo/layer_7/stable_rank_q_proj": 42.68766784667969, "geo/layer_7/stable_rank_k_proj": 40.08113098144531, "geo/layer_7/stable_rank_o_proj": 89.5345687866211, "geo/layer_7/stable_rank_gate_proj": 79.23355102539062, "geo/layer_7/stable_rank_down_proj": 142.91981506347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43334516882896423, "geo/layer_7/attn_entropy_mean": 4.693680763244629, "geo/layer_7/attn_entropy_std": 0.805399477481842, "geo/layer_14/stable_rank_q_proj": 50.759769439697266, "geo/layer_14/stable_rank_k_proj": 40.99737548828125, "geo/layer_14/stable_rank_o_proj": 43.13288116455078, "geo/layer_14/stable_rank_gate_proj": 71.50655364990234, "geo/layer_14/stable_rank_down_proj": 126.67750549316406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3921484351158142, "geo/layer_14/attn_entropy_mean": 5.4892120361328125, "geo/layer_14/attn_entropy_std": 0.4098954498767853, "geo/layer_21/stable_rank_q_proj": 40.09022521972656, "geo/layer_21/stable_rank_k_proj": 29.859548568725586, "geo/layer_21/stable_rank_o_proj": 67.71453857421875, "geo/layer_21/stable_rank_gate_proj": 63.647705078125, "geo/layer_21/stable_rank_down_proj": 49.896183013916016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14373570680618286, "geo/layer_21/attn_entropy_mean": 5.895023345947266, "geo/layer_21/attn_entropy_std": 0.301862508058548, "geo/layer_27/stable_rank_q_proj": 43.98106002807617, "geo/layer_27/stable_rank_k_proj": 31.483299255371094, "geo/layer_27/stable_rank_o_proj": 115.17903137207031, "geo/layer_27/stable_rank_gate_proj": 76.14213562011719, "geo/layer_27/stable_rank_down_proj": 127.37117004394531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10247098654508591, "geo/layer_27/attn_entropy_mean": 4.34317684173584, "geo/layer_27/attn_entropy_std": 0.6317051649093628, "attnres/final_alpha/block_0": 0.24478265643119812, "attnres/block_norm/0": 1.7835447788238525, "attnres/final_alpha/block_1": 0.004014467354863882, "attnres/block_norm/1": 48875.1328125, "attnres/final_alpha/block_2": 0.008877739310264587, "attnres/block_norm/2": 29371.927734375, "attnres/final_alpha/block_3": 0.010500982403755188, "attnres/block_norm/3": 66368.0, "attnres/final_alpha/block_4": 0.01237866748124361, "attnres/block_norm/4": 16247.6845703125, "attnres/final_alpha/block_5": 0.617957592010498, "attnres/block_norm/5": 6787.8046875, "attnres/final_alpha/block_6": 0.10148793458938599, "attnres/block_norm/6": 44548.3828125, "geo/tier1_time_s": 1.3594274520874023, "geo/step": 1425.0, "geo/rankme_slope": 0.0037749054031563285} {"step": 1430, "timestamp": 1778327263.7387087, "train/loss": 2.4291782140731812, "train/z_loss": 0.001343454315792769, "train/perplexity": 11.349551345022176, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701148.1762733688, "perf/iters_per_sec": 0.8111706620566219, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.232786202430725, "data/tokens_consumed": 3001024512, "data/tokens_consumed_B": 3.001024512, "train/loss_slope": -7.073101864324619e-05} {"step": 1440, "timestamp": 1778327274.101095, "train/loss": 2.3838497400283813, "train/z_loss": 0.0013597632176242768, "train/perplexity": 10.846579100856198, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024728.9208078794, "perf/iters_per_sec": 0.9654659847297093, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035769271850586, "data/tokens_consumed": 3021996032, "data/tokens_consumed_B": 3.021996032, "train/loss_slope": -6.95208944932427e-05} {"step": 1450, "timestamp": 1778327284.4523826, "grad/layer_0/attn": 0.003595771500840783, "grad/layer_0/mlp": 0.003455917350947857, "grad/layer_0/attn_mlp_ratio": 1.0404679949327402, "grad/layer_4/attn": 0.002035395707935095, "grad/layer_4/mlp": 0.002667355351150036, "grad/layer_4/attn_mlp_ratio": 0.7630762923094969, "grad/layer_8/attn": 0.004005254711955786, "grad/layer_8/mlp": 0.0034935481380671263, "grad/layer_8/attn_mlp_ratio": 1.1464718501129874, "grad/layer_12/attn": 0.006202634423971176, "grad/layer_12/mlp": 0.006983627565205097, "grad/layer_12/attn_mlp_ratio": 0.8881679724815363, "grad/layer_16/attn": 0.0036673101130872965, "grad/layer_16/mlp": 0.005073309410363436, "grad/layer_16/attn_mlp_ratio": 0.7228634692198352, "grad/layer_20/attn": 0.00405964395031333, "grad/layer_20/mlp": 0.00625341571867466, "grad/layer_20/attn_mlp_ratio": 0.6491882305651194, "grad/layer_24/attn": 0.004716569557785988, "grad/layer_24/mlp": 0.007650458719581366, "grad/layer_24/attn_mlp_ratio": 0.6165080642894147, "grad/layer_27/attn": 0.005402426701039076, "grad/layer_27/mlp": 0.0074430350214242935, "grad/layer_27/attn_mlp_ratio": 0.7258365186922928} {"step": 1450, "timestamp": 1778327284.4679258, "train/loss": 2.457877993583679, "train/z_loss": 0.0013477061758749187, "train/perplexity": 11.680000189024428, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024036.0767517593, "perf/iters_per_sec": 0.9651356109389111, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361238241195678, "data/tokens_consumed": 3042967552, "data/tokens_consumed_B": 3.042967552, "train/loss_slope": -6.243017072474464e-05} {"step": 1460, "timestamp": 1778327294.8204079, "train/loss": 2.4049731254577638, "train/z_loss": 0.0013570444076322018, "train/perplexity": 11.078132558444507, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026627.3862865777, "perf/iters_per_sec": 0.9663712436135186, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347990036010741, "data/tokens_consumed": 3063939072, "data/tokens_consumed_B": 3.063939072, "train/loss_slope": -6.192561287512268e-05} {"step": 1470, "timestamp": 1778327305.1791513, "train/loss": 2.412216138839722, "train/z_loss": 0.00136143967974931, "train/perplexity": 11.158662909959379, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025495.9303201768, "perf/iters_per_sec": 0.965831723365868, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353770494461059, "data/tokens_consumed": 3084910592, "data/tokens_consumed_B": 3.084910592, "train/loss_slope": -6.152417527423977e-05} {"step": 1480, "timestamp": 1778327315.539512, "train/loss": 2.4231859683990478, "train/z_loss": 0.0013460308313369752, "train/perplexity": 11.281745402895561, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025532.0315201322, "perf/iters_per_sec": 0.9658489377594625, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353585958480835, "data/tokens_consumed": 3105882112, "data/tokens_consumed_B": 3.105882112, "train/loss_slope": -6.0382980274574735e-05} {"step": 1490, "timestamp": 1778327325.8985293, "train/loss": 2.412117528915405, "train/z_loss": 0.001358474069274962, "train/perplexity": 11.157562609305526, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025481.0051354703, "perf/iters_per_sec": 0.9658246064832069, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353846788406371, "data/tokens_consumed": 3126853632, "data/tokens_consumed_B": 3.126853632, "train/loss_slope": -5.706308552570046e-05} {"step": 1500, "timestamp": 1778327336.2558386, "grad/layer_0/attn": 0.0027033931110054255, "grad/layer_0/mlp": 0.0029816022142767906, "grad/layer_0/attn_mlp_ratio": 0.9066913780086574, "grad/layer_4/attn": 0.0025121085345745087, "grad/layer_4/mlp": 0.0025210389867424965, "grad/layer_4/attn_mlp_ratio": 0.9964575907549584, "grad/layer_8/attn": 0.0043466039933264256, "grad/layer_8/mlp": 0.003215002128854394, "grad/layer_8/attn_mlp_ratio": 1.351975421452587, "grad/layer_12/attn": 0.005615397822111845, "grad/layer_12/mlp": 0.007087592501193285, "grad/layer_12/attn_mlp_ratio": 0.7922856374626305, "grad/layer_16/attn": 0.004134009592235088, "grad/layer_16/mlp": 0.005008343607187271, "grad/layer_16/attn_mlp_ratio": 0.8254244983830761, "grad/layer_20/attn": 0.005945803131908178, "grad/layer_20/mlp": 0.006732648238539696, "grad/layer_20/attn_mlp_ratio": 0.8831299115791658, "grad/layer_24/attn": 0.00516200577840209, "grad/layer_24/mlp": 0.007777020800858736, "grad/layer_24/attn_mlp_ratio": 0.6637510486608186, "grad/layer_27/attn": 0.005347646772861481, "grad/layer_27/mlp": 0.007236916106194258, "grad/layer_27/attn_mlp_ratio": 0.7389399877649953} {"step": 1500, "timestamp": 1778327336.8897765, "eos/sharpness": 30.232763290405266, "eos/L0_probe": 2.4198296070098877, "eos/L_plus": 2.6124091148376465, "eos/L_minus": 2.5295777320861816, "eos/grad_norm": 0.1074455976486206, "eos/embed_grad_frac": 0.20992238819599152, "eos/time_s": 0.6309952735900879} {"step": 1500, "timestamp": 1778327336.910864, "train/loss": 2.3707064151763917, "train/z_loss": 0.0013578381040133536, "train/perplexity": 10.70495175411617, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905384.5807209397, "perf/iters_per_sec": 0.9085581687550257, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1006449937820435, "data/tokens_consumed": 3147825152, "data/tokens_consumed_B": 3.147825152, "train/loss_slope": -5.878131064144016e-05} {"step": 1500, "timestamp": 1778327338.2740326, "geo/rankme_last": 422.30780029296875, "geo/layer_0/stable_rank_q_proj": 20.089099884033203, "geo/layer_0/stable_rank_k_proj": 16.93480682373047, "geo/layer_0/stable_rank_o_proj": 46.25442123413086, "geo/layer_0/stable_rank_gate_proj": 129.61732482910156, "geo/layer_0/stable_rank_down_proj": 55.59941482543945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06588900089263916, "geo/layer_0/attn_entropy_mean": 6.25449275970459, "geo/layer_0/attn_entropy_std": 0.40421241521835327, "geo/layer_7/stable_rank_q_proj": 42.69042205810547, "geo/layer_7/stable_rank_k_proj": 40.1383171081543, "geo/layer_7/stable_rank_o_proj": 89.29743194580078, "geo/layer_7/stable_rank_gate_proj": 79.28196716308594, "geo/layer_7/stable_rank_down_proj": 142.7028350830078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42268386483192444, "geo/layer_7/attn_entropy_mean": 4.678974151611328, "geo/layer_7/attn_entropy_std": 0.7810505628585815, "geo/layer_14/stable_rank_q_proj": 50.7093620300293, "geo/layer_14/stable_rank_k_proj": 41.099822998046875, "geo/layer_14/stable_rank_o_proj": 43.05465316772461, "geo/layer_14/stable_rank_gate_proj": 71.55238342285156, "geo/layer_14/stable_rank_down_proj": 127.08126831054688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37374451756477356, "geo/layer_14/attn_entropy_mean": 5.500985145568848, "geo/layer_14/attn_entropy_std": 0.4134328067302704, "geo/layer_21/stable_rank_q_proj": 40.06124496459961, "geo/layer_21/stable_rank_k_proj": 29.831775665283203, "geo/layer_21/stable_rank_o_proj": 67.55206298828125, "geo/layer_21/stable_rank_gate_proj": 63.554847717285156, "geo/layer_21/stable_rank_down_proj": 49.96126174926758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14104072749614716, "geo/layer_21/attn_entropy_mean": 5.8912811279296875, "geo/layer_21/attn_entropy_std": 0.31465819478034973, "geo/layer_27/stable_rank_q_proj": 43.99234390258789, "geo/layer_27/stable_rank_k_proj": 31.409366607666016, "geo/layer_27/stable_rank_o_proj": 115.200927734375, "geo/layer_27/stable_rank_gate_proj": 76.0689697265625, "geo/layer_27/stable_rank_down_proj": 127.50276947021484, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11476937681436539, "geo/layer_27/attn_entropy_mean": 4.311779499053955, "geo/layer_27/attn_entropy_std": 0.6214499473571777, "attnres/final_alpha/block_0": 0.24626624584197998, "attnres/block_norm/0": 1.7830654382705688, "attnres/final_alpha/block_1": 0.00401287991553545, "attnres/block_norm/1": 48754.32421875, "attnres/final_alpha/block_2": 0.008998721837997437, "attnres/block_norm/2": 29538.802734375, "attnres/final_alpha/block_3": 0.01082786824554205, "attnres/block_norm/3": 66715.921875, "attnres/final_alpha/block_4": 0.012345334514975548, "attnres/block_norm/4": 16323.947265625, "attnres/final_alpha/block_5": 0.6153477430343628, "attnres/block_norm/5": 6846.71435546875, "attnres/final_alpha/block_6": 0.10220123082399368, "attnres/block_norm/6": 44909.7421875, "geo/tier1_time_s": 1.3586366176605225, "geo/step": 1500.0, "geo/rankme_slope": 0.003561614329680736} {"step": 1500, "timestamp": 1778327345.251756, "geo/ww_alpha_mean": 7.277179540616277, "geo/ww_alpha_std": 3.830168083490474, "geo/ww_alpha_min": 1.3453401259502495, "geo/ww_alpha_max": 25.078637380219842, "geo/ww_alpha_healthy_frac": 0.18781725888324874, "geo/ww_alpha_by_type/q_proj": 3.9229509555855198, "geo/ww_alpha_by_type/k_proj": 4.5377589320085905, "geo/ww_alpha_by_type/v_proj": 7.621464339135328, "geo/ww_alpha_by_type/o_proj": 8.513237044567617, "geo/ww_alpha_by_type/gate_proj": 7.871831268687136, "geo/ww_alpha_by_type/up_proj": 10.600307557251764, "geo/ww_alpha_by_type/down_proj": 7.957199462833574, "geo/twonn_id/layer_0": 0.7704127430915833, "geo/twonn_id/layer_7": 3.553037405014038, "geo/twonn_id/layer_14": 5.405167102813721, "geo/twonn_id/layer_21": 9.517290115356445, "geo/twonn_id/layer_27": 6.154417991638184, "geo/tier2_time_s": 6.949798583984375} {"step": 1500, "timestamp": 1778327346.008453, "eoc/jacobian_sigma/layer_0/attn": 1199.78662109375, "eoc/jacobian_sigma/layer_0/mlp": 9426.72265625, "eoc/jacobian_sigma/layer_0": 9426.72265625, "eoc/jacobian_sigma/layer_7/attn": 1.0555951595306396, "eoc/jacobian_sigma/layer_7/mlp": 1.7299375534057617, "eoc/jacobian_sigma/layer_7": 1.7299375534057617, "eoc/jacobian_sigma/layer_14/attn": 1.860052466392517, "eoc/jacobian_sigma/layer_14/mlp": 11.184298515319824, "eoc/jacobian_sigma/layer_14": 11.184298515319824, "eoc/jacobian_sigma/layer_21/attn": 1.0201064348220825, "eoc/jacobian_sigma/layer_21/mlp": 5.188093185424805, "eoc/jacobian_sigma/layer_21": 5.188093185424805, "eoc/jacobian_sigma/layer_27/attn": 3.5209720134735107, "eoc/jacobian_sigma/layer_27/mlp": 25.082239151000977, "eoc/jacobian_sigma/layer_27": 25.082239151000977, "eoc/layer0_sigma": 9426.72265625, "eoc/sigma_max": 25.082239151000977, "eoc/sigma_min": 1.7299375534057617, "eoc/sigma_mean": 10.796142101287842, "eoc/time_s": 0.7500674724578857} {"step": 1510, "timestamp": 1778327356.3849943, "train/loss": 2.4357882261276247, "train/z_loss": 0.0013449980528093875, "train/perplexity": 11.424820507203524, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1077134.864566479, "perf/iters_per_sec": 0.5136179278213878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9469725370407105, "data/tokens_consumed": 3168796672, "data/tokens_consumed_B": 3.168796672, "train/loss_slope": -5.6777258002289504e-05} {"step": 1520, "timestamp": 1778327367.413178, "train/loss": 2.406293773651123, "train/z_loss": 0.0013582000625319778, "train/perplexity": 11.092772539197863, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1902794.4135896675, "perf/iters_per_sec": 0.9073230808208788, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1021432399749755, "data/tokens_consumed": 3189768192, "data/tokens_consumed_B": 3.189768192, "train/loss_slope": -5.540640173178697e-05} {"step": 1530, "timestamp": 1778327377.7869105, "train/loss": 2.42839720249176, "train/z_loss": 0.0013428006088361144, "train/perplexity": 11.340690674571302, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022817.494695277, "perf/iters_per_sec": 0.9645545457340607, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367480039596557, "data/tokens_consumed": 3210739712, "data/tokens_consumed_B": 3.210739712, "train/loss_slope": -5.3952898136531504e-05} {"step": 1540, "timestamp": 1778327388.1522362, "train/loss": 2.4516770601272584, "train/z_loss": 0.0013503226451575755, "train/perplexity": 11.607797378851101, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024292.8736503418, "perf/iters_per_sec": 0.9652580612422665, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359923839569092, "data/tokens_consumed": 3231711232, "data/tokens_consumed_B": 3.231711232, "train/loss_slope": -5.160511768225467e-05} {"step": 1550, "timestamp": 1778327398.5056176, "grad/layer_0/attn": 0.0026810290291905403, "grad/layer_0/mlp": 0.003056358778849244, "grad/layer_0/attn_mlp_ratio": 0.8771970620806088, "grad/layer_4/attn": 0.003082736860960722, "grad/layer_4/mlp": 0.0028692791238427162, "grad/layer_4/attn_mlp_ratio": 1.07439416678037, "grad/layer_8/attn": 0.0032088940497487783, "grad/layer_8/mlp": 0.0035320501774549484, "grad/layer_8/attn_mlp_ratio": 0.9085074666776761, "grad/layer_12/attn": 0.007685528602451086, "grad/layer_12/mlp": 0.007895332761108875, "grad/layer_12/attn_mlp_ratio": 0.9734268000667521, "grad/layer_16/attn": 0.00391190592199564, "grad/layer_16/mlp": 0.004735724534839392, "grad/layer_16/attn_mlp_ratio": 0.8260416775960432, "grad/layer_20/attn": 0.0031779322307556868, "grad/layer_20/mlp": 0.0068386574275791645, "grad/layer_20/attn_mlp_ratio": 0.46470118118060694, "grad/layer_24/attn": 0.016784997656941414, "grad/layer_24/mlp": 0.012429394759237766, "grad/layer_24/attn_mlp_ratio": 1.350427582921825, "grad/layer_27/attn": 0.012002344243228436, "grad/layer_27/mlp": 0.014576172456145287, "grad/layer_27/attn_mlp_ratio": 0.8234222116263484} {"step": 1550, "timestamp": 1778327398.5215375, "train/loss": 2.4285499811172486, "train/z_loss": 0.0013432492967694997, "train/perplexity": 11.342423422064673, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023777.9946859134, "perf/iters_per_sec": 0.96501254782005, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036255955696106, "data/tokens_consumed": 3252682752, "data/tokens_consumed_B": 3.252682752, "train/loss_slope": -5.120265680571199e-05} {"step": 1560, "timestamp": 1778327408.891932, "train/loss": 2.367576503753662, "train/z_loss": 0.001357082254253328, "train/perplexity": 10.671498583381693, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024150.1437320102, "perf/iters_per_sec": 0.9651900023136187, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036065435409546, "data/tokens_consumed": 3273654272, "data/tokens_consumed_B": 3.273654272, "train/loss_slope": -5.034003792816263e-05} {"step": 1570, "timestamp": 1778327419.248175, "train/loss": 2.417421054840088, "train/z_loss": 0.0013503507478162646, "train/perplexity": 11.216894226174142, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026013.2238594485, "perf/iters_per_sec": 0.9660783881470911, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035112690925598, "data/tokens_consumed": 3294625792, "data/tokens_consumed_B": 3.294625792, "train/loss_slope": -5.1634503355120654e-05} {"step": 1575, "timestamp": 1778327425.032906, "eos/sharpness": 60.78591346740721, "eos/L0_probe": 2.4142744541168213, "eos/L_plus": 2.672264814376831, "eos/L_minus": 2.764143228530884, "eos/grad_norm": 0.16579562425613403, "eos/embed_grad_frac": 0.08187586814165115, "eos/time_s": 0.6099550724029541} {"step": 1575, "timestamp": 1778327426.4138649, "geo/rankme_last": 422.3760681152344, "geo/layer_0/stable_rank_q_proj": 20.077810287475586, "geo/layer_0/stable_rank_k_proj": 16.908000946044922, "geo/layer_0/stable_rank_o_proj": 46.19306945800781, "geo/layer_0/stable_rank_gate_proj": 129.60635375976562, "geo/layer_0/stable_rank_down_proj": 55.663150787353516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06474463641643524, "geo/layer_0/attn_entropy_mean": 6.251786231994629, "geo/layer_0/attn_entropy_std": 0.40294647216796875, "geo/layer_7/stable_rank_q_proj": 42.660423278808594, "geo/layer_7/stable_rank_k_proj": 40.041481018066406, "geo/layer_7/stable_rank_o_proj": 89.35203552246094, "geo/layer_7/stable_rank_gate_proj": 79.20386505126953, "geo/layer_7/stable_rank_down_proj": 142.9119415283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43095460534095764, "geo/layer_7/attn_entropy_mean": 4.69373083114624, "geo/layer_7/attn_entropy_std": 0.7878755331039429, "geo/layer_14/stable_rank_q_proj": 50.82583999633789, "geo/layer_14/stable_rank_k_proj": 41.09858703613281, "geo/layer_14/stable_rank_o_proj": 43.02798080444336, "geo/layer_14/stable_rank_gate_proj": 71.72515869140625, "geo/layer_14/stable_rank_down_proj": 127.10863494873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3857470750808716, "geo/layer_14/attn_entropy_mean": 5.519686698913574, "geo/layer_14/attn_entropy_std": 0.42110762000083923, "geo/layer_21/stable_rank_q_proj": 40.08087158203125, "geo/layer_21/stable_rank_k_proj": 29.762104034423828, "geo/layer_21/stable_rank_o_proj": 67.52716064453125, "geo/layer_21/stable_rank_gate_proj": 63.59317398071289, "geo/layer_21/stable_rank_down_proj": 50.00697326660156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13958588242530823, "geo/layer_21/attn_entropy_mean": 5.890045642852783, "geo/layer_21/attn_entropy_std": 0.3166675567626953, "geo/layer_27/stable_rank_q_proj": 44.01511001586914, "geo/layer_27/stable_rank_k_proj": 31.311052322387695, "geo/layer_27/stable_rank_o_proj": 114.92330932617188, "geo/layer_27/stable_rank_gate_proj": 75.93245697021484, "geo/layer_27/stable_rank_down_proj": 127.29327392578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1110353022813797, "geo/layer_27/attn_entropy_mean": 4.328145980834961, "geo/layer_27/attn_entropy_std": 0.650894820690155, "attnres/final_alpha/block_0": 0.24930568039417267, "attnres/block_norm/0": 1.782702922821045, "attnres/final_alpha/block_1": 0.004171347711235285, "attnres/block_norm/1": 48924.10546875, "attnres/final_alpha/block_2": 0.009036549367010593, "attnres/block_norm/2": 29403.77734375, "attnres/final_alpha/block_3": 0.010789177380502224, "attnres/block_norm/3": 66233.4375, "attnres/final_alpha/block_4": 0.01259426586329937, "attnres/block_norm/4": 16338.544921875, "attnres/final_alpha/block_5": 0.6076254844665527, "attnres/block_norm/5": 6908.1650390625, "attnres/final_alpha/block_6": 0.1064775139093399, "attnres/block_norm/6": 44232.3359375, "geo/tier1_time_s": 1.3610525131225586, "geo/step": 1575.0, "geo/rankme_slope": 0.0033699960132428713} {"step": 1580, "timestamp": 1778327431.6119874, "train/loss": 2.3751232624053955, "train/z_loss": 0.0013600101112388075, "train/perplexity": 10.752338463498873, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697168.0365374263, "perf/iters_per_sec": 0.8092727835356838, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2356772899627686, "data/tokens_consumed": 3315597312, "data/tokens_consumed_B": 3.315597312, "train/loss_slope": -5.294611729411007e-05} {"step": 1590, "timestamp": 1778327441.9729834, "train/loss": 2.403574228286743, "train/z_loss": 0.0013482811278663575, "train/perplexity": 11.062646224568084, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025147.8576466884, "perf/iters_per_sec": 0.9656657493813936, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355550050735474, "data/tokens_consumed": 3336568832, "data/tokens_consumed_B": 3.336568832, "train/loss_slope": -4.996792957513064e-05} {"step": 1600, "timestamp": 1778327452.3243802, "grad/layer_0/attn": 0.0026113507337868214, "grad/layer_0/mlp": 0.0030590626411139965, "grad/layer_0/attn_mlp_ratio": 0.8536440585837297, "grad/layer_4/attn": 0.001811513677239418, "grad/layer_4/mlp": 0.0025634136982262135, "grad/layer_4/attn_mlp_ratio": 0.706680161623893, "grad/layer_8/attn": 0.003683471353724599, "grad/layer_8/mlp": 0.0034717281814664602, "grad/layer_8/attn_mlp_ratio": 1.0609906810358725, "grad/layer_12/attn": 0.00608892971649766, "grad/layer_12/mlp": 0.006598376668989658, "grad/layer_12/attn_mlp_ratio": 0.922792063816931, "grad/layer_16/attn": 0.004376962315291166, "grad/layer_16/mlp": 0.004560953937470913, "grad/layer_16/attn_mlp_ratio": 0.9596593781327006, "grad/layer_20/attn": 0.003079162910580635, "grad/layer_20/mlp": 0.00578346336260438, "grad/layer_20/attn_mlp_ratio": 0.5324081202363199, "grad/layer_24/attn": 0.00515408581122756, "grad/layer_24/mlp": 0.008684690110385418, "grad/layer_24/attn_mlp_ratio": 0.5934680093786359, "grad/layer_27/attn": 0.008517192676663399, "grad/layer_27/mlp": 0.008288576267659664, "grad/layer_27/attn_mlp_ratio": 1.027582095991267} {"step": 1600, "timestamp": 1778327452.3399894, "train/loss": 2.4505481243133547, "train/z_loss": 0.0013449020916596055, "train/perplexity": 11.59470031493285, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023971.3870647415, "perf/iters_per_sec": 0.9651047644923885, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361569404602051, "data/tokens_consumed": 3357540352, "data/tokens_consumed_B": 3.357540352, "train/loss_slope": -4.7587423942627704e-05} {"step": 1610, "timestamp": 1778327462.7143314, "train/loss": 2.4125162839889525, "train/z_loss": 0.0013467845623381437, "train/perplexity": 11.162012631179843, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022979.6703980924, "perf/iters_per_sec": 0.9646318771353208, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036664891242981, "data/tokens_consumed": 3378511872, "data/tokens_consumed_B": 3.378511872, "train/loss_slope": -4.773975720535768e-05} {"step": 1620, "timestamp": 1778327473.070925, "train/loss": 2.3877921104431152, "train/z_loss": 0.0013487884891219438, "train/perplexity": 10.889424734590664, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026062.08368285, "perf/iters_per_sec": 0.9661016863264322, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350877285003661, "data/tokens_consumed": 3399483392, "data/tokens_consumed_B": 3.399483392, "train/loss_slope": -4.6514483711364e-05} {"step": 1630, "timestamp": 1778327483.4275162, "train/loss": 2.408043909072876, "train/z_loss": 0.001348179834894836, "train/perplexity": 11.112203391696095, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025965.48630811, "perf/iters_per_sec": 0.9660556251087713, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351370811462401, "data/tokens_consumed": 3420454912, "data/tokens_consumed_B": 3.420454912, "train/loss_slope": -4.5179213227147e-05} {"step": 1640, "timestamp": 1778327493.7874591, "train/loss": 2.407554841041565, "train/z_loss": 0.001345782319549471, "train/perplexity": 11.106770096993463, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025643.888110934, "perf/iters_per_sec": 0.9659022751383467, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035301423072815, "data/tokens_consumed": 3441426432, "data/tokens_consumed_B": 3.441426432, "train/loss_slope": -4.522408759526007e-05} {"step": 1650, "timestamp": 1778327504.123637, "grad/layer_0/attn": 0.00311569357290864, "grad/layer_0/mlp": 0.0032899973448365927, "grad/layer_0/attn_mlp_ratio": 0.947020058571319, "grad/layer_4/attn": 0.002239801688119769, "grad/layer_4/mlp": 0.002741293516010046, "grad/layer_4/attn_mlp_ratio": 0.8170601190031593, "grad/layer_8/attn": 0.0036806976422667503, "grad/layer_8/mlp": 0.0036002458073198795, "grad/layer_8/attn_mlp_ratio": 1.022346177738387, "grad/layer_12/attn": 0.005572770722210407, "grad/layer_12/mlp": 0.007092413958162069, "grad/layer_12/attn_mlp_ratio": 0.7857367994184109, "grad/layer_16/attn": 0.0036759201902896166, "grad/layer_16/mlp": 0.004777615889906883, "grad/layer_16/attn_mlp_ratio": 0.7694046985055535, "grad/layer_20/attn": 0.0031992122530937195, "grad/layer_20/mlp": 0.006054427940398455, "grad/layer_20/attn_mlp_ratio": 0.5284086674656675, "grad/layer_24/attn": 0.010916469618678093, "grad/layer_24/mlp": 0.010679518803954124, "grad/layer_24/attn_mlp_ratio": 1.022187395973075, "grad/layer_27/attn": 0.0076299142092466354, "grad/layer_27/mlp": 0.010797752998769283, "grad/layer_27/attn_mlp_ratio": 0.7066205477615788} {"step": 1650, "timestamp": 1778327504.7268531, "eos/sharpness": 64.71939086914061, "eos/L0_probe": 2.413055896759033, "eos/L_plus": 2.8055832386016846, "eos/L_minus": 2.667722463607788, "eos/grad_norm": 0.18543609976768494, "eos/embed_grad_frac": 0.06458709388971329, "eos/time_s": 0.6004831790924072} {"step": 1650, "timestamp": 1778327504.7472677, "train/loss": 2.424547815322876, "train/z_loss": 0.0013482966111041605, "train/perplexity": 11.297119879633671, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914481.8995133555, "perf/iters_per_sec": 0.9128961083952691, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.095414900779724, "data/tokens_consumed": 3462397952, "data/tokens_consumed_B": 3.462397952, "train/loss_slope": -4.3621944687297106e-05} {"step": 1650, "timestamp": 1778327506.1093106, "geo/rankme_last": 423.2621765136719, "geo/layer_0/stable_rank_q_proj": 20.100988388061523, "geo/layer_0/stable_rank_k_proj": 16.950679779052734, "geo/layer_0/stable_rank_o_proj": 46.1221809387207, "geo/layer_0/stable_rank_gate_proj": 129.78070068359375, "geo/layer_0/stable_rank_down_proj": 55.723793029785156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06423281133174896, "geo/layer_0/attn_entropy_mean": 6.2521257400512695, "geo/layer_0/attn_entropy_std": 0.40961623191833496, "geo/layer_7/stable_rank_q_proj": 42.64131546020508, "geo/layer_7/stable_rank_k_proj": 39.99592208862305, "geo/layer_7/stable_rank_o_proj": 89.25662994384766, "geo/layer_7/stable_rank_gate_proj": 79.15858459472656, "geo/layer_7/stable_rank_down_proj": 143.03878784179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42488762736320496, "geo/layer_7/attn_entropy_mean": 4.690124988555908, "geo/layer_7/attn_entropy_std": 0.7682454586029053, "geo/layer_14/stable_rank_q_proj": 50.834598541259766, "geo/layer_14/stable_rank_k_proj": 41.10316467285156, "geo/layer_14/stable_rank_o_proj": 42.975528717041016, "geo/layer_14/stable_rank_gate_proj": 71.8002700805664, "geo/layer_14/stable_rank_down_proj": 126.78923797607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37272462248802185, "geo/layer_14/attn_entropy_mean": 5.529172897338867, "geo/layer_14/attn_entropy_std": 0.4112994968891144, "geo/layer_21/stable_rank_q_proj": 40.07194137573242, "geo/layer_21/stable_rank_k_proj": 29.794322967529297, "geo/layer_21/stable_rank_o_proj": 67.43383026123047, "geo/layer_21/stable_rank_gate_proj": 63.592411041259766, "geo/layer_21/stable_rank_down_proj": 49.979164123535156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1431024819612503, "geo/layer_21/attn_entropy_mean": 5.896372318267822, "geo/layer_21/attn_entropy_std": 0.310115247964859, "geo/layer_27/stable_rank_q_proj": 44.028106689453125, "geo/layer_27/stable_rank_k_proj": 31.27501678466797, "geo/layer_27/stable_rank_o_proj": 114.74566650390625, "geo/layer_27/stable_rank_gate_proj": 75.84749603271484, "geo/layer_27/stable_rank_down_proj": 127.4616470336914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1098410040140152, "geo/layer_27/attn_entropy_mean": 4.329860687255859, "geo/layer_27/attn_entropy_std": 0.6628031730651855, "attnres/final_alpha/block_0": 0.24466446042060852, "attnres/block_norm/0": 1.7823426723480225, "attnres/final_alpha/block_1": 0.004064138047397137, "attnres/block_norm/1": 48989.4140625, "attnres/final_alpha/block_2": 0.00872185081243515, "attnres/block_norm/2": 29584.49609375, "attnres/final_alpha/block_3": 0.01049533486366272, "attnres/block_norm/3": 67189.734375, "attnres/final_alpha/block_4": 0.012197492644190788, "attnres/block_norm/4": 16397.318359375, "attnres/final_alpha/block_5": 0.6172126531600952, "attnres/block_norm/5": 6755.865234375, "attnres/final_alpha/block_6": 0.10264404118061066, "attnres/block_norm/6": 44389.37109375, "geo/tier1_time_s": 1.3582122325897217, "geo/step": 1650.0, "geo/rankme_slope": 0.00331599058369874} {"step": 1660, "timestamp": 1778327516.4631708, "train/loss": 2.4442567110061644, "train/z_loss": 0.0013563197338953615, "train/perplexity": 11.52198225263067, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790565.351637891, "perf/iters_per_sec": 0.8538080938519912, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1712233781814576, "data/tokens_consumed": 3483369472, "data/tokens_consumed_B": 3.483369472, "train/loss_slope": -3.98970418530043e-05} {"step": 1670, "timestamp": 1778327526.8328373, "train/loss": 2.381760668754578, "train/z_loss": 0.0013596991077065468, "train/perplexity": 10.823943475986635, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023427.7193664254, "perf/iters_per_sec": 0.9648455235321166, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036435341835022, "data/tokens_consumed": 3504340992, "data/tokens_consumed_B": 3.504340992, "train/loss_slope": -4.111194318742178e-05} {"step": 1680, "timestamp": 1778327537.191514, "train/loss": 2.4168045043945314, "train/z_loss": 0.001352185453288257, "train/perplexity": 11.209980576567116, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025528.2067842549, "perf/iters_per_sec": 0.9658471139832758, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353605508804322, "data/tokens_consumed": 3525312512, "data/tokens_consumed_B": 3.525312512, "train/loss_slope": -3.844758502148739e-05} {"step": 1690, "timestamp": 1778327547.550451, "train/loss": 2.3544247150421143, "train/z_loss": 0.0013521795975975693, "train/perplexity": 10.532068178154269, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025469.6248299319, "perf/iters_per_sec": 0.9658191799306545, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353904962539673, "data/tokens_consumed": 3546284032, "data/tokens_consumed_B": 3.546284032, "train/loss_slope": -3.932517931835451e-05} {"step": 1700, "timestamp": 1778327557.896406, "grad/layer_0/attn": 0.002980646211653948, "grad/layer_0/mlp": 0.003240876831114292, "grad/layer_0/attn_mlp_ratio": 0.9197036095502489, "grad/layer_4/attn": 0.003823115024715662, "grad/layer_4/mlp": 0.0025995878968387842, "grad/layer_4/attn_mlp_ratio": 1.4706619007953343, "grad/layer_8/attn": 0.0034428879152983427, "grad/layer_8/mlp": 0.003439385211095214, "grad/layer_8/attn_mlp_ratio": 1.001018380869346, "grad/layer_12/attn": 0.005759603809565306, "grad/layer_12/mlp": 0.006812410429120064, "grad/layer_12/attn_mlp_ratio": 0.8454575344432824, "grad/layer_16/attn": 0.0048142378218472, "grad/layer_16/mlp": 0.004751916509121656, "grad/layer_16/attn_mlp_ratio": 1.013114963466723, "grad/layer_20/attn": 0.002837534062564373, "grad/layer_20/mlp": 0.006540085654705763, "grad/layer_20/attn_mlp_ratio": 0.43386802084708037, "grad/layer_24/attn": 0.011226437985897064, "grad/layer_24/mlp": 0.010369130410254002, "grad/layer_24/attn_mlp_ratio": 1.0826788200606863, "grad/layer_27/attn": 0.004513941239565611, "grad/layer_27/mlp": 0.011606795713305473, "grad/layer_27/attn_mlp_ratio": 0.38890502703519997} {"step": 1700, "timestamp": 1778327557.9121535, "train/loss": 2.4103877544403076, "train/z_loss": 0.001339751307386905, "train/perplexity": 11.138279225060758, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024953.4012890318, "perf/iters_per_sec": 0.9655730253644141, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356544494628905, "data/tokens_consumed": 3567255552, "data/tokens_consumed_B": 3.567255552, "train/loss_slope": -3.7924145021275524e-05} {"step": 1710, "timestamp": 1778327568.2596204, "train/loss": 2.4267361164093018, "train/z_loss": 0.0013464193092659117, "train/perplexity": 11.321868448123372, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027830.6022747657, "perf/iters_per_sec": 0.9669449817060307, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034185004234314, "data/tokens_consumed": 3588227072, "data/tokens_consumed_B": 3.588227072, "train/loss_slope": -3.62323407042395e-05} {"step": 1720, "timestamp": 1778327578.6127782, "train/loss": 2.456222677230835, "train/z_loss": 0.0013307620421983302, "train/perplexity": 11.660682086906565, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026644.2428412994, "perf/iters_per_sec": 0.9663792814451692, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347903966903687, "data/tokens_consumed": 3609198592, "data/tokens_consumed_B": 3.609198592, "train/loss_slope": -3.338996108644831e-05} {"step": 1725, "timestamp": 1778327584.372278, "eos/sharpness": 11.130905151367186, "eos/L0_probe": 2.4084768295288086, "eos/L_plus": 2.4677178859710693, "eos/L_minus": 2.4605448246002197, "eos/grad_norm": 0.12124574184417725, "eos/embed_grad_frac": 0.202167347073555, "eos/time_s": 0.5954794883728027} {"step": 1725, "timestamp": 1778327585.7528787, "geo/rankme_last": 423.24761962890625, "geo/layer_0/stable_rank_q_proj": 20.133892059326172, "geo/layer_0/stable_rank_k_proj": 17.006507873535156, "geo/layer_0/stable_rank_o_proj": 46.15428161621094, "geo/layer_0/stable_rank_gate_proj": 129.89878845214844, "geo/layer_0/stable_rank_down_proj": 55.68263244628906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06490539014339447, "geo/layer_0/attn_entropy_mean": 6.25703239440918, "geo/layer_0/attn_entropy_std": 0.4080713391304016, "geo/layer_7/stable_rank_q_proj": 42.706878662109375, "geo/layer_7/stable_rank_k_proj": 40.13029861450195, "geo/layer_7/stable_rank_o_proj": 89.30697631835938, "geo/layer_7/stable_rank_gate_proj": 79.12543487548828, "geo/layer_7/stable_rank_down_proj": 143.33609008789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4299015402793884, "geo/layer_7/attn_entropy_mean": 4.700146675109863, "geo/layer_7/attn_entropy_std": 0.7736154198646545, "geo/layer_14/stable_rank_q_proj": 50.8188362121582, "geo/layer_14/stable_rank_k_proj": 41.04155349731445, "geo/layer_14/stable_rank_o_proj": 42.97220993041992, "geo/layer_14/stable_rank_gate_proj": 71.69290161132812, "geo/layer_14/stable_rank_down_proj": 126.66923522949219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39128610491752625, "geo/layer_14/attn_entropy_mean": 5.53626823425293, "geo/layer_14/attn_entropy_std": 0.4032261073589325, "geo/layer_21/stable_rank_q_proj": 40.08066940307617, "geo/layer_21/stable_rank_k_proj": 29.80875587463379, "geo/layer_21/stable_rank_o_proj": 67.41558837890625, "geo/layer_21/stable_rank_gate_proj": 63.583194732666016, "geo/layer_21/stable_rank_down_proj": 49.89845275878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451684832572937, "geo/layer_21/attn_entropy_mean": 5.884044647216797, "geo/layer_21/attn_entropy_std": 0.31702759861946106, "geo/layer_27/stable_rank_q_proj": 44.02388381958008, "geo/layer_27/stable_rank_k_proj": 31.312538146972656, "geo/layer_27/stable_rank_o_proj": 114.83377838134766, "geo/layer_27/stable_rank_gate_proj": 75.68281555175781, "geo/layer_27/stable_rank_down_proj": 127.66222381591797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10006999224424362, "geo/layer_27/attn_entropy_mean": 4.35567045211792, "geo/layer_27/attn_entropy_std": 0.6677613854408264, "attnres/final_alpha/block_0": 0.24739260971546173, "attnres/block_norm/0": 1.7824034690856934, "attnres/final_alpha/block_1": 0.004003660753369331, "attnres/block_norm/1": 49100.6640625, "attnres/final_alpha/block_2": 0.009039856493473053, "attnres/block_norm/2": 29598.208984375, "attnres/final_alpha/block_3": 0.010818177834153175, "attnres/block_norm/3": 66560.84375, "attnres/final_alpha/block_4": 0.01226906105875969, "attnres/block_norm/4": 16407.01171875, "attnres/final_alpha/block_5": 0.6126781105995178, "attnres/block_norm/5": 6867.41552734375, "attnres/final_alpha/block_6": 0.103798508644104, "attnres/block_norm/6": 44498.8515625, "geo/tier1_time_s": 1.3589308261871338, "geo/step": 1725.0, "geo/rankme_slope": 0.003239305579144022} {"step": 1730, "timestamp": 1778327590.9296737, "train/loss": 2.402897334098816, "train/z_loss": 0.0013476907508447767, "train/perplexity": 11.055160517437224, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703391.7455756096, "perf/iters_per_sec": 0.8122404792669342, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2311624765396119, "data/tokens_consumed": 3630170112, "data/tokens_consumed_B": 3.630170112, "train/loss_slope": -3.40144899204047e-05} {"step": 1740, "timestamp": 1778327601.2797308, "train/loss": 2.380105805397034, "train/z_loss": 0.0013574633980169893, "train/perplexity": 10.80604614145009, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027292.989601867, "perf/iters_per_sec": 0.9666886280068717, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344592571258544, "data/tokens_consumed": 3651141632, "data/tokens_consumed_B": 3.651141632, "train/loss_slope": -3.531068256228239e-05} {"step": 1750, "timestamp": 1778327611.6253793, "grad/layer_0/attn": 0.0025739220436662436, "grad/layer_0/mlp": 0.002953991759568453, "grad/layer_0/attn_mlp_ratio": 0.8713368776995433, "grad/layer_4/attn": 0.0019104023231193423, "grad/layer_4/mlp": 0.0025821623858064413, "grad/layer_4/attn_mlp_ratio": 0.739845898010055, "grad/layer_8/attn": 0.00488698435947299, "grad/layer_8/mlp": 0.003635923145338893, "grad/layer_8/attn_mlp_ratio": 1.3440834774875698, "grad/layer_12/attn": 0.007460570894181728, "grad/layer_12/mlp": 0.00639465032145381, "grad/layer_12/attn_mlp_ratio": 1.1666894048112149, "grad/layer_16/attn": 0.004891138058155775, "grad/layer_16/mlp": 0.004258864559233189, "grad/layer_16/attn_mlp_ratio": 1.1484605521689502, "grad/layer_20/attn": 0.0046262918040156364, "grad/layer_20/mlp": 0.006118529010564089, "grad/layer_20/attn_mlp_ratio": 0.7561117583028256, "grad/layer_24/attn": 0.01556701771914959, "grad/layer_24/mlp": 0.013171293772757053, "grad/layer_24/attn_mlp_ratio": 1.1818897877108148, "grad/layer_27/attn": 0.004879091866314411, "grad/layer_27/mlp": 0.014204524457454681, "grad/layer_27/attn_mlp_ratio": 0.34348857271353117} {"step": 1750, "timestamp": 1778327611.64115, "train/loss": 2.381764769554138, "train/z_loss": 0.0013554968987591564, "train/perplexity": 10.823987862900294, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025180.0763914897, "perf/iters_per_sec": 0.9656811124761056, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355385303497315, "data/tokens_consumed": 3672113152, "data/tokens_consumed_B": 3.672113152, "train/loss_slope": -3.887238226386113e-05} {"step": 1760, "timestamp": 1778327621.9955602, "train/loss": 2.3945274353027344, "train/z_loss": 0.0013460638234391808, "train/perplexity": 10.963016100386886, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026592.7868685364, "perf/iters_per_sec": 0.9663547453253443, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348166704177857, "data/tokens_consumed": 3693084672, "data/tokens_consumed_B": 3.693084672, "train/loss_slope": -3.777718443860527e-05} {"step": 1770, "timestamp": 1778327632.3437812, "train/loss": 2.3854965448379515, "train/z_loss": 0.0013544762157835066, "train/perplexity": 10.864456015341116, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027680.22147677, "perf/iters_per_sec": 0.9668732745536661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034261703491211, "data/tokens_consumed": 3714056192, "data/tokens_consumed_B": 3.714056192, "train/loss_slope": -4.158905713435874e-05} {"step": 1780, "timestamp": 1778327642.706003, "train/loss": 2.4111671924591063, "train/z_loss": 0.00135721875121817, "train/perplexity": 11.14696420761587, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025474.3355146635, "perf/iters_per_sec": 0.9658214261601751, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353880882263184, "data/tokens_consumed": 3735027712, "data/tokens_consumed_B": 3.735027712, "train/loss_slope": -4.038073878512881e-05} {"step": 1790, "timestamp": 1778327653.0762863, "train/loss": 2.387105178833008, "train/z_loss": 0.001357402722351253, "train/perplexity": 10.881947013160246, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023314.7576956686, "perf/iters_per_sec": 0.9647916592100471, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364932060241698, "data/tokens_consumed": 3755999232, "data/tokens_consumed_B": 3.755999232, "train/loss_slope": -4.2174482746164305e-05} {"step": 1800, "timestamp": 1778327663.4232934, "grad/layer_0/attn": 0.002795472973957658, "grad/layer_0/mlp": 0.0030956489499658346, "grad/layer_0/attn_mlp_ratio": 0.9030329113012698, "grad/layer_4/attn": 0.0019300533458590508, "grad/layer_4/mlp": 0.0027898107655346394, "grad/layer_4/attn_mlp_ratio": 0.6918222915047595, "grad/layer_8/attn": 0.0045323725789785385, "grad/layer_8/mlp": 0.0035351437982171774, "grad/layer_8/attn_mlp_ratio": 1.28208998260701, "grad/layer_12/attn": 0.007454250007867813, "grad/layer_12/mlp": 0.007286014035344124, "grad/layer_12/attn_mlp_ratio": 1.0230902478911734, "grad/layer_16/attn": 0.003704723669216037, "grad/layer_16/mlp": 0.004818928427994251, "grad/layer_16/attn_mlp_ratio": 0.7687857679761082, "grad/layer_20/attn": 0.004134605638682842, "grad/layer_20/mlp": 0.005951928440481424, "grad/layer_20/attn_mlp_ratio": 0.6946665455678359, "grad/layer_24/attn": 0.005791386589407921, "grad/layer_24/mlp": 0.007495180703699589, "grad/layer_24/attn_mlp_ratio": 0.7726813723491924, "grad/layer_27/attn": 0.006920775398612022, "grad/layer_27/mlp": 0.007594900205731392, "grad/layer_27/attn_mlp_ratio": 0.911239795127969} {"step": 1800, "timestamp": 1778327664.0224257, "eos/sharpness": 48.769092559814446, "eos/L0_probe": 2.40773606300354, "eos/L_plus": 2.727710485458374, "eos/L_minus": 2.5754525661468506, "eos/grad_norm": 0.13503088057041168, "eos/embed_grad_frac": 0.12673309445381165, "eos/time_s": 0.5962672233581543} {"step": 1800, "timestamp": 1778327664.0416381, "train/loss": 2.4368860721588135, "train/z_loss": 0.001359185460023582, "train/perplexity": 11.437370088547972, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913435.4140928683, "perf/iters_per_sec": 0.912397105261263, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0960139989852906, "data/tokens_consumed": 3776970752, "data/tokens_consumed_B": 3.776970752, "train/loss_slope": -3.964309612266156e-05} {"step": 1800, "timestamp": 1778327665.405952, "geo/rankme_last": 423.8494873046875, "geo/layer_0/stable_rank_q_proj": 20.166105270385742, "geo/layer_0/stable_rank_k_proj": 17.0304012298584, "geo/layer_0/stable_rank_o_proj": 46.11974334716797, "geo/layer_0/stable_rank_gate_proj": 129.9539031982422, "geo/layer_0/stable_rank_down_proj": 55.682518005371094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0687461867928505, "geo/layer_0/attn_entropy_mean": 6.258974075317383, "geo/layer_0/attn_entropy_std": 0.40921565890312195, "geo/layer_7/stable_rank_q_proj": 42.7098388671875, "geo/layer_7/stable_rank_k_proj": 40.14457321166992, "geo/layer_7/stable_rank_o_proj": 89.28400421142578, "geo/layer_7/stable_rank_gate_proj": 79.1237564086914, "geo/layer_7/stable_rank_down_proj": 143.5092315673828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.424618124961853, "geo/layer_7/attn_entropy_mean": 4.735288619995117, "geo/layer_7/attn_entropy_std": 0.7609041333198547, "geo/layer_14/stable_rank_q_proj": 50.84903335571289, "geo/layer_14/stable_rank_k_proj": 41.04952621459961, "geo/layer_14/stable_rank_o_proj": 42.98299026489258, "geo/layer_14/stable_rank_gate_proj": 71.67660522460938, "geo/layer_14/stable_rank_down_proj": 126.48248291015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3704162836074829, "geo/layer_14/attn_entropy_mean": 5.53337287902832, "geo/layer_14/attn_entropy_std": 0.40220907330513, "geo/layer_21/stable_rank_q_proj": 40.05400085449219, "geo/layer_21/stable_rank_k_proj": 29.775732040405273, "geo/layer_21/stable_rank_o_proj": 67.41497039794922, "geo/layer_21/stable_rank_gate_proj": 63.500125885009766, "geo/layer_21/stable_rank_down_proj": 49.915374755859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.138316348195076, "geo/layer_21/attn_entropy_mean": 5.884521484375, "geo/layer_21/attn_entropy_std": 0.316938579082489, "geo/layer_27/stable_rank_q_proj": 44.060028076171875, "geo/layer_27/stable_rank_k_proj": 31.315433502197266, "geo/layer_27/stable_rank_o_proj": 114.7409896850586, "geo/layer_27/stable_rank_gate_proj": 75.55626678466797, "geo/layer_27/stable_rank_down_proj": 127.59394836425781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09247706085443497, "geo/layer_27/attn_entropy_mean": 4.353711128234863, "geo/layer_27/attn_entropy_std": 0.6344072222709656, "attnres/final_alpha/block_0": 0.24798662960529327, "attnres/block_norm/0": 1.7822092771530151, "attnres/final_alpha/block_1": 0.004160846583545208, "attnres/block_norm/1": 48907.16796875, "attnres/final_alpha/block_2": 0.009052114561200142, "attnres/block_norm/2": 29496.396484375, "attnres/final_alpha/block_3": 0.010672313161194324, "attnres/block_norm/3": 66829.546875, "attnres/final_alpha/block_4": 0.012515557929873466, "attnres/block_norm/4": 16423.30859375, "attnres/final_alpha/block_5": 0.612265408039093, "attnres/block_norm/5": 6854.6943359375, "attnres/final_alpha/block_6": 0.10334710776805878, "attnres/block_norm/6": 44353.05859375, "geo/tier1_time_s": 1.3605186939239502, "geo/step": 1800.0, "geo/rankme_slope": 0.003223783991887019} {"step": 1810, "timestamp": 1778327675.7569716, "train/loss": 2.3775199174880983, "train/z_loss": 0.0013507270021364092, "train/perplexity": 10.778139015290385, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790647.9498179273, "perf/iters_per_sec": 0.8538474797334324, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171169352531433, "data/tokens_consumed": 3797942272, "data/tokens_consumed_B": 3.797942272, "train/loss_slope": -4.113953361774469e-05} {"step": 1820, "timestamp": 1778327686.1099849, "train/loss": 2.3947640657424927, "train/z_loss": 0.0013411149382591247, "train/perplexity": 10.96561059066338, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026881.104750048, "perf/iters_per_sec": 0.9664922260046234, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346694707870483, "data/tokens_consumed": 3818913792, "data/tokens_consumed_B": 3.818913792, "train/loss_slope": -3.879144249683451e-05} {"step": 1830, "timestamp": 1778327696.4538445, "train/loss": 2.3789103984832765, "train/z_loss": 0.0013615251169539987, "train/perplexity": 10.793136237014078, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028330.8946135978, "perf/iters_per_sec": 0.9671835396831502, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339299201965333, "data/tokens_consumed": 3839885312, "data/tokens_consumed_B": 3.839885312, "train/loss_slope": -3.959325404986939e-05} {"step": 1840, "timestamp": 1778327706.8047812, "train/loss": 2.3639297246932984, "train/z_loss": 0.0013587135588750243, "train/perplexity": 10.63265285974165, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027091.627676523, "perf/iters_per_sec": 0.9665926111586204, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345620155334472, "data/tokens_consumed": 3860856832, "data/tokens_consumed_B": 3.860856832, "train/loss_slope": -4.090146833878179e-05} {"step": 1850, "timestamp": 1778327717.1488655, "grad/layer_0/attn": 0.003325291909277439, "grad/layer_0/mlp": 0.0032834361772984266, "grad/layer_0/attn_mlp_ratio": 1.0127475085380524, "grad/layer_4/attn": 0.0018870458006858826, "grad/layer_4/mlp": 0.0024700413923710585, "grad/layer_4/attn_mlp_ratio": 0.7639733204944898, "grad/layer_8/attn": 0.005112094804644585, "grad/layer_8/mlp": 0.0035916678607463837, "grad/layer_8/attn_mlp_ratio": 1.4233205464745247, "grad/layer_12/attn": 0.00558746000751853, "grad/layer_12/mlp": 0.006690333131700754, "grad/layer_12/attn_mlp_ratio": 0.8351542163914215, "grad/layer_16/attn": 0.0035774726420640945, "grad/layer_16/mlp": 0.004566603805869818, "grad/layer_16/attn_mlp_ratio": 0.7833989362347993, "grad/layer_20/attn": 0.008356325328350067, "grad/layer_20/mlp": 0.006179953459650278, "grad/layer_20/attn_mlp_ratio": 1.3521663630143768, "grad/layer_24/attn": 0.011788634583353996, "grad/layer_24/mlp": 0.011826264671981335, "grad/layer_24/attn_mlp_ratio": 0.9968180833633549, "grad/layer_27/attn": 0.010127127170562744, "grad/layer_27/mlp": 0.009855929762125015, "grad/layer_27/attn_mlp_ratio": 1.027516156489699} {"step": 1850, "timestamp": 1778327717.1645885, "train/loss": 2.386001706123352, "train/z_loss": 0.0013551758136600256, "train/perplexity": 10.869945704379447, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025583.80680501, "perf/iters_per_sec": 0.9658736261391687, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353321313858033, "data/tokens_consumed": 3881828352, "data/tokens_consumed_B": 3.881828352, "train/loss_slope": -4.1634050590155744e-05} {"step": 1860, "timestamp": 1778327727.5166066, "train/loss": 2.4412073850631715, "train/z_loss": 0.001335070445202291, "train/perplexity": 11.486901486760631, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026960.5536834456, "perf/iters_per_sec": 0.966530110208247, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346289157867432, "data/tokens_consumed": 3902799872, "data/tokens_consumed_B": 3.902799872, "train/loss_slope": -3.7002133479033906e-05} {"step": 1870, "timestamp": 1778327737.8702838, "train/loss": 2.4026564598083495, "train/z_loss": 0.0013581693754531443, "train/perplexity": 11.052497934178394, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026700.2777255068, "perf/iters_per_sec": 0.9664060009601149, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347617864608765, "data/tokens_consumed": 3923771392, "data/tokens_consumed_B": 3.923771392, "train/loss_slope": -3.724199052404933e-05} {"step": 1875, "timestamp": 1778327743.6455333, "eos/sharpness": 13.147830963134764, "eos/L0_probe": 2.4038732051849365, "eos/L_plus": 2.482761859893799, "eos/L_minus": 2.456462860107422, "eos/grad_norm": 0.10765580087900162, "eos/embed_grad_frac": 0.210359588265419, "eos/time_s": 0.6043453216552734} {"step": 1875, "timestamp": 1778327745.026556, "geo/rankme_last": 422.7818603515625, "geo/layer_0/stable_rank_q_proj": 20.181743621826172, "geo/layer_0/stable_rank_k_proj": 17.031347274780273, "geo/layer_0/stable_rank_o_proj": 46.06125259399414, "geo/layer_0/stable_rank_gate_proj": 129.86312866210938, "geo/layer_0/stable_rank_down_proj": 55.67684555053711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06558871269226074, "geo/layer_0/attn_entropy_mean": 6.255622386932373, "geo/layer_0/attn_entropy_std": 0.4145672023296356, "geo/layer_7/stable_rank_q_proj": 42.70280075073242, "geo/layer_7/stable_rank_k_proj": 40.09232711791992, "geo/layer_7/stable_rank_o_proj": 89.15876770019531, "geo/layer_7/stable_rank_gate_proj": 79.11955261230469, "geo/layer_7/stable_rank_down_proj": 143.79623413085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42654770612716675, "geo/layer_7/attn_entropy_mean": 4.72951602935791, "geo/layer_7/attn_entropy_std": 0.7857164740562439, "geo/layer_14/stable_rank_q_proj": 50.889163970947266, "geo/layer_14/stable_rank_k_proj": 41.18793869018555, "geo/layer_14/stable_rank_o_proj": 43.00769805908203, "geo/layer_14/stable_rank_gate_proj": 71.61195373535156, "geo/layer_14/stable_rank_down_proj": 126.8095703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3828372359275818, "geo/layer_14/attn_entropy_mean": 5.546718597412109, "geo/layer_14/attn_entropy_std": 0.40385881066322327, "geo/layer_21/stable_rank_q_proj": 40.01093673706055, "geo/layer_21/stable_rank_k_proj": 29.821903228759766, "geo/layer_21/stable_rank_o_proj": 67.33228302001953, "geo/layer_21/stable_rank_gate_proj": 63.544097900390625, "geo/layer_21/stable_rank_down_proj": 49.97724533081055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1425103098154068, "geo/layer_21/attn_entropy_mean": 5.902373790740967, "geo/layer_21/attn_entropy_std": 0.3074610233306885, "geo/layer_27/stable_rank_q_proj": 43.97974395751953, "geo/layer_27/stable_rank_k_proj": 31.255876541137695, "geo/layer_27/stable_rank_o_proj": 114.45066833496094, "geo/layer_27/stable_rank_gate_proj": 75.5785140991211, "geo/layer_27/stable_rank_down_proj": 127.5311508178711, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10475826263427734, "geo/layer_27/attn_entropy_mean": 4.325721740722656, "geo/layer_27/attn_entropy_std": 0.6435617804527283, "attnres/final_alpha/block_0": 0.24975216388702393, "attnres/block_norm/0": 1.7820751667022705, "attnres/final_alpha/block_1": 0.00417828606441617, "attnres/block_norm/1": 49106.9140625, "attnres/final_alpha/block_2": 0.009012926369905472, "attnres/block_norm/2": 29516.41796875, "attnres/final_alpha/block_3": 0.010787256062030792, "attnres/block_norm/3": 66998.828125, "attnres/final_alpha/block_4": 0.01239343173801899, "attnres/block_norm/4": 16378.884765625, "attnres/final_alpha/block_5": 0.6085864305496216, "attnres/block_norm/5": 6946.1826171875, "attnres/final_alpha/block_6": 0.10528954863548279, "attnres/block_norm/6": 44464.828125, "geo/tier1_time_s": 1.360034465789795, "geo/step": 1875.0, "geo/rankme_slope": 0.0030623583845263533} {"step": 1880, "timestamp": 1778327750.2067, "train/loss": 2.3970810413360595, "train/z_loss": 0.0013416030793450772, "train/perplexity": 10.99104709927593, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700728.7402905435, "perf/iters_per_sec": 0.8109706593945234, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2330902338027954, "data/tokens_consumed": 3944742912, "data/tokens_consumed_B": 3.944742912, "train/loss_slope": -3.737415594987383e-05} {"step": 1890, "timestamp": 1778327760.554966, "train/loss": 2.3978540182113646, "train/z_loss": 0.0013552095275372266, "train/perplexity": 10.99954620890348, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027991.5255492528, "perf/iters_per_sec": 0.9670217159029258, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341029405593871, "data/tokens_consumed": 3965714432, "data/tokens_consumed_B": 3.965714432, "train/loss_slope": -3.658291646177405e-05} {"step": 1900, "timestamp": 1778327770.9034102, "grad/layer_0/attn": 0.0036043040454387665, "grad/layer_0/mlp": 0.003589126979932189, "grad/layer_0/attn_mlp_ratio": 1.0042285951900216, "grad/layer_4/attn": 0.0018895982066169381, "grad/layer_4/mlp": 0.002539641223847866, "grad/layer_4/attn_mlp_ratio": 0.7440413687055487, "grad/layer_8/attn": 0.003586952807381749, "grad/layer_8/mlp": 0.0036055229138582945, "grad/layer_8/attn_mlp_ratio": 0.9948495110403766, "grad/layer_12/attn": 0.010181164368987083, "grad/layer_12/mlp": 0.007080249022692442, "grad/layer_12/attn_mlp_ratio": 1.4379669687548284, "grad/layer_16/attn": 0.003545706160366535, "grad/layer_16/mlp": 0.0044926428236067295, "grad/layer_16/attn_mlp_ratio": 0.7892250109029396, "grad/layer_20/attn": 0.003277519252151251, "grad/layer_20/mlp": 0.005901705473661423, "grad/layer_20/attn_mlp_ratio": 0.555351196572464, "grad/layer_24/attn": 0.005372193641960621, "grad/layer_24/mlp": 0.008515042252838612, "grad/layer_24/attn_mlp_ratio": 0.6309062737861454, "grad/layer_27/attn": 0.010468753054738045, "grad/layer_27/mlp": 0.006810659077018499, "grad/layer_27/attn_mlp_ratio": 1.5371130433399471} {"step": 1900, "timestamp": 1778327770.9190624, "train/loss": 2.441360282897949, "train/z_loss": 0.0013422021991573274, "train/perplexity": 11.488657943402046, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024437.3476312275, "perf/iters_per_sec": 0.9653269518047464, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359184503555299, "data/tokens_consumed": 3986685952, "data/tokens_consumed_B": 3.986685952, "train/loss_slope": -3.2398709898913475e-05} {"step": 1910, "timestamp": 1778327781.2709851, "train/loss": 2.4076749324798583, "train/z_loss": 0.0013521705521270632, "train/perplexity": 11.10810400508307, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027124.8893477141, "perf/iters_per_sec": 0.9666084715593882, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345450401306153, "data/tokens_consumed": 4007657472, "data/tokens_consumed_B": 4.007657472, "train/loss_slope": -3.2638970045152096e-05} {"step": 1920, "timestamp": 1778327791.6149497, "train/loss": 2.423146629333496, "train/z_loss": 0.0013489637523889542, "train/perplexity": 11.281301598303104, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028289.782683131, "perf/iters_per_sec": 0.9671639359870582, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339508771896362, "data/tokens_consumed": 4028628992, "data/tokens_consumed_B": 4.028628992, "train/loss_slope": -3.1064414248393535e-05} {"step": 1930, "timestamp": 1778327801.9629672, "train/loss": 2.3834667682647703, "train/z_loss": 0.0013589461683295666, "train/perplexity": 10.842425962646889, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027732.0131532855, "perf/iters_per_sec": 0.9668979707495143, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342352867126465, "data/tokens_consumed": 4049600512, "data/tokens_consumed_B": 4.049600512, "train/loss_slope": -3.2984036538037984e-05} {"step": 1940, "timestamp": 1778327812.3082526, "train/loss": 2.4206624269485473, "train/z_loss": 0.001361796201672405, "train/perplexity": 11.253311343091248, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028252.6945102818, "perf/iters_per_sec": 0.9671462509681138, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033969783782959, "data/tokens_consumed": 4070572032, "data/tokens_consumed_B": 4.070572032, "train/loss_slope": -3.348723861357849e-05} {"step": 1950, "timestamp": 1778327822.6390913, "grad/layer_0/attn": 0.0032748475205153227, "grad/layer_0/mlp": 0.00356985442340374, "grad/layer_0/attn_mlp_ratio": 0.9173616176921581, "grad/layer_4/attn": 0.0023492653854191303, "grad/layer_4/mlp": 0.0026003376115113497, "grad/layer_4/attn_mlp_ratio": 0.9034462620063711, "grad/layer_8/attn": 0.007925033569335938, "grad/layer_8/mlp": 0.003510024631395936, "grad/layer_8/attn_mlp_ratio": 2.257828413130341, "grad/layer_12/attn": 0.00548147689551115, "grad/layer_12/mlp": 0.0071311830542981625, "grad/layer_12/attn_mlp_ratio": 0.7686630362603031, "grad/layer_16/attn": 0.005539343226701021, "grad/layer_16/mlp": 0.004584667272865772, "grad/layer_16/attn_mlp_ratio": 1.2082323048091717, "grad/layer_20/attn": 0.003205900778993964, "grad/layer_20/mlp": 0.005913004279136658, "grad/layer_20/attn_mlp_ratio": 0.5421779815191088, "grad/layer_24/attn": 0.008894231170415878, "grad/layer_24/mlp": 0.008782049641013145, "grad/layer_24/attn_mlp_ratio": 1.012773946027524, "grad/layer_27/attn": 0.004327914211899042, "grad/layer_27/mlp": 0.008592137135565281, "grad/layer_27/attn_mlp_ratio": 0.5037063646963859} {"step": 1950, "timestamp": 1778327823.2348692, "eos/sharpness": 32.10053443908691, "eos/L0_probe": 2.4047160148620605, "eos/L_plus": 2.55393648147583, "eos/L_minus": 2.57650089263916, "eos/grad_norm": 0.1154395118355751, "eos/embed_grad_frac": 0.15586024522781372, "eos/time_s": 0.5929379463195801} {"step": 1950, "timestamp": 1778327823.2539139, "train/loss": 2.379353976249695, "train/z_loss": 0.0013576662982814013, "train/perplexity": 10.797924894271171, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916613.4709369973, "perf/iters_per_sec": 0.9139125208554255, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.094196629524231, "data/tokens_consumed": 4091543552, "data/tokens_consumed_B": 4.091543552, "train/loss_slope": -3.5136463418222955e-05} {"step": 1950, "timestamp": 1778327824.6169045, "geo/rankme_last": 424.20208740234375, "geo/layer_0/stable_rank_q_proj": 20.204748153686523, "geo/layer_0/stable_rank_k_proj": 17.029979705810547, "geo/layer_0/stable_rank_o_proj": 46.093631744384766, "geo/layer_0/stable_rank_gate_proj": 129.67127990722656, "geo/layer_0/stable_rank_down_proj": 55.568389892578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06886796653270721, "geo/layer_0/attn_entropy_mean": 6.255372524261475, "geo/layer_0/attn_entropy_std": 0.4168509244918823, "geo/layer_7/stable_rank_q_proj": 42.678260803222656, "geo/layer_7/stable_rank_k_proj": 40.14728546142578, "geo/layer_7/stable_rank_o_proj": 89.1849594116211, "geo/layer_7/stable_rank_gate_proj": 79.23014831542969, "geo/layer_7/stable_rank_down_proj": 143.8372344970703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42001575231552124, "geo/layer_7/attn_entropy_mean": 4.72866678237915, "geo/layer_7/attn_entropy_std": 0.7626897096633911, "geo/layer_14/stable_rank_q_proj": 50.89195251464844, "geo/layer_14/stable_rank_k_proj": 41.18486785888672, "geo/layer_14/stable_rank_o_proj": 42.985469818115234, "geo/layer_14/stable_rank_gate_proj": 71.52799224853516, "geo/layer_14/stable_rank_down_proj": 126.89503479003906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38268548250198364, "geo/layer_14/attn_entropy_mean": 5.539560317993164, "geo/layer_14/attn_entropy_std": 0.4112037718296051, "geo/layer_21/stable_rank_q_proj": 40.017330169677734, "geo/layer_21/stable_rank_k_proj": 29.76900863647461, "geo/layer_21/stable_rank_o_proj": 67.28145599365234, "geo/layer_21/stable_rank_gate_proj": 63.51762008666992, "geo/layer_21/stable_rank_down_proj": 49.92155838012695, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13868597149848938, "geo/layer_21/attn_entropy_mean": 5.890147686004639, "geo/layer_21/attn_entropy_std": 0.3095189034938812, "geo/layer_27/stable_rank_q_proj": 44.0253791809082, "geo/layer_27/stable_rank_k_proj": 31.21944808959961, "geo/layer_27/stable_rank_o_proj": 114.15077209472656, "geo/layer_27/stable_rank_gate_proj": 75.57551574707031, "geo/layer_27/stable_rank_down_proj": 127.6396713256836, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1006193533539772, "geo/layer_27/attn_entropy_mean": 4.340872764587402, "geo/layer_27/attn_entropy_std": 0.6509530544281006, "attnres/final_alpha/block_0": 0.24944224953651428, "attnres/block_norm/0": 1.7819066047668457, "attnres/final_alpha/block_1": 0.004197374917566776, "attnres/block_norm/1": 49090.22265625, "attnres/final_alpha/block_2": 0.009115029126405716, "attnres/block_norm/2": 29633.984375, "attnres/final_alpha/block_3": 0.010812661610543728, "attnres/block_norm/3": 66522.328125, "attnres/final_alpha/block_4": 0.01262123603373766, "attnres/block_norm/4": 16441.5078125, "attnres/final_alpha/block_5": 0.6093326807022095, "attnres/block_norm/5": 6917.1904296875, "attnres/final_alpha/block_6": 0.10447876900434494, "attnres/block_norm/6": 44502.703125, "geo/tier1_time_s": 1.359032154083252, "geo/step": 1950.0, "geo/rankme_slope": 0.003060226968308023} {"step": 1960, "timestamp": 1778327834.9628851, "train/loss": 2.4371731758117674, "train/z_loss": 0.0013458178145810962, "train/perplexity": 11.440654270708368, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791632.9714924796, "perf/iters_per_sec": 0.8543171746694944, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1705254554748534, "data/tokens_consumed": 4112515072, "data/tokens_consumed_B": 4.112515072, "train/loss_slope": -3.342838257309775e-05} {"step": 1970, "timestamp": 1778327845.3129525, "train/loss": 2.406598138809204, "train/z_loss": 0.0013502465095371007, "train/perplexity": 11.096149306524406, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027192.4905176861, "perf/iters_per_sec": 0.9666407063091689, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345105409622193, "data/tokens_consumed": 4133486592, "data/tokens_consumed_B": 4.133486592, "train/loss_slope": -3.371912970258212e-05} {"step": 1980, "timestamp": 1778327855.657554, "train/loss": 2.3922118186950683, "train/z_loss": 0.0013627664418891072, "train/perplexity": 10.937659327846944, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028196.901113004, "perf/iters_per_sec": 0.9671196466031093, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033998227119446, "data/tokens_consumed": 4154458112, "data/tokens_consumed_B": 4.154458112, "train/loss_slope": -3.3118817386346844e-05} {"step": 1990, "timestamp": 1778327866.0060327, "train/loss": 2.420259404182434, "train/z_loss": 0.001338992326054722, "train/perplexity": 11.248776916225825, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027819.569540139, "perf/iters_per_sec": 0.9669397208882041, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341906309127809, "data/tokens_consumed": 4175429632, "data/tokens_consumed_B": 4.175429632, "train/loss_slope": -3.0610116995242995e-05} {"step": 2000, "timestamp": 1778327876.3413677, "grad/layer_0/attn": 0.003203825792297721, "grad/layer_0/mlp": 0.0033479698467999697, "grad/layer_0/attn_mlp_ratio": 0.9569458039370923, "grad/layer_4/attn": 0.002776334062218666, "grad/layer_4/mlp": 0.0027191615663468838, "grad/layer_4/attn_mlp_ratio": 1.021025743551538, "grad/layer_8/attn": 0.004310965538024902, "grad/layer_8/mlp": 0.0036602781619876623, "grad/layer_8/attn_mlp_ratio": 1.1777698932878107, "grad/layer_12/attn": 0.00595048675313592, "grad/layer_12/mlp": 0.007883463986217976, "grad/layer_12/attn_mlp_ratio": 0.7548060964137169, "grad/layer_16/attn": 0.003913832362741232, "grad/layer_16/mlp": 0.005696757696568966, "grad/layer_16/attn_mlp_ratio": 0.687028043407155, "grad/layer_20/attn": 0.003094421699643135, "grad/layer_20/mlp": 0.00709494948387146, "grad/layer_20/attn_mlp_ratio": 0.4361442830654506, "grad/layer_24/attn": 0.017636116594076157, "grad/layer_24/mlp": 0.013018464669585228, "grad/layer_24/attn_mlp_ratio": 1.3547001821043487, "grad/layer_27/attn": 0.009034913964569569, "grad/layer_27/mlp": 0.013403397984802723, "grad/layer_27/attn_mlp_ratio": 0.6740763728277006} {"step": 2000, "timestamp": 1778327876.3567038, "train/loss": 2.4244613885879516, "train/z_loss": 0.001363169914111495, "train/perplexity": 11.296143548639584, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027308.2685778907, "perf/iters_per_sec": 0.9666959135903791, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034451460838318, "data/tokens_consumed": 4196401152, "data/tokens_consumed_B": 4.196401152, "train/loss_slope": -2.8669079352240734e-05} {"step": 2000, "timestamp": 1778327883.3938096, "geo/ww_alpha_mean": 7.5817157854095525, "geo/ww_alpha_std": 4.400391257504419, "geo/ww_alpha_min": 1.3372156792428926, "geo/ww_alpha_max": 30.582601068392584, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9132366774708816, "geo/ww_alpha_by_type/k_proj": 4.577821766356829, "geo/ww_alpha_by_type/v_proj": 8.700258534585455, "geo/ww_alpha_by_type/o_proj": 8.54001703127719, "geo/ww_alpha_by_type/gate_proj": 7.8497703359983095, "geo/ww_alpha_by_type/up_proj": 11.741973108669153, "geo/ww_alpha_by_type/down_proj": 7.842885384188762, "geo/twonn_id/layer_0": 0.7092325687408447, "geo/twonn_id/layer_7": 3.162158966064453, "geo/twonn_id/layer_14": 4.6528778076171875, "geo/twonn_id/layer_21": 8.064129829406738, "geo/twonn_id/layer_27": 6.4718194007873535, "geo/tier2_time_s": 7.026087999343872} {"step": 2000, "timestamp": 1778327884.15228, "eoc/jacobian_sigma/layer_0/attn": 1455.2691650390625, "eoc/jacobian_sigma/layer_0/mlp": 10134.91796875, "eoc/jacobian_sigma/layer_0": 10134.91796875, "eoc/jacobian_sigma/layer_7/attn": 1.0646641254425049, "eoc/jacobian_sigma/layer_7/mlp": 1.75235116481781, "eoc/jacobian_sigma/layer_7": 1.75235116481781, "eoc/jacobian_sigma/layer_14/attn": 1.8678455352783203, "eoc/jacobian_sigma/layer_14/mlp": 13.174175262451172, "eoc/jacobian_sigma/layer_14": 13.174175262451172, "eoc/jacobian_sigma/layer_21/attn": 1.026452898979187, "eoc/jacobian_sigma/layer_21/mlp": 5.112973690032959, "eoc/jacobian_sigma/layer_21": 5.112973690032959, "eoc/jacobian_sigma/layer_27/attn": 3.4413700103759766, "eoc/jacobian_sigma/layer_27/mlp": 27.2725830078125, "eoc/jacobian_sigma/layer_27": 27.2725830078125, "eoc/layer0_sigma": 10134.91796875, "eoc/sigma_max": 27.2725830078125, "eoc/sigma_min": 1.75235116481781, "eoc/sigma_mean": 11.82802078127861, "eoc/time_s": 0.7511148452758789} {"step": 2010, "timestamp": 1778327894.521069, "train/loss": 2.4123896598815917, "train/z_loss": 0.0013551411568187178, "train/perplexity": 11.160599340774276, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1155175.279212079, "perf/iters_per_sec": 0.5508304973659892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8154405117034913, "data/tokens_consumed": 4217372672, "data/tokens_consumed_B": 4.217372672, "train/loss_slope": -2.735127617280628e-05} {"step": 2020, "timestamp": 1778327904.8772376, "train/loss": 2.40696005821228, "train/z_loss": 0.0013519863947294652, "train/perplexity": 11.100165945063734, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026065.8170996027, "perf/iters_per_sec": 0.966103466558267, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350858211517333, "data/tokens_consumed": 4238344192, "data/tokens_consumed_B": 4.238344192, "train/loss_slope": -2.638735982916081e-05} {"step": 2025, "timestamp": 1778327910.6452157, "eos/sharpness": 9.381604194641112, "eos/L0_probe": 2.4041993618011475, "eos/L_plus": 2.4656426906585693, "eos/L_minus": 2.4365720748901367, "eos/grad_norm": 0.098775215446949, "eos/embed_grad_frac": 0.2599759101867676, "eos/time_s": 0.6022636890411377} {"step": 2025, "timestamp": 1778327912.0245662, "geo/rankme_last": 422.3829040527344, "geo/layer_0/stable_rank_q_proj": 20.24637222290039, "geo/layer_0/stable_rank_k_proj": 17.019296646118164, "geo/layer_0/stable_rank_o_proj": 46.077972412109375, "geo/layer_0/stable_rank_gate_proj": 129.629150390625, "geo/layer_0/stable_rank_down_proj": 55.629920959472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06381548941135406, "geo/layer_0/attn_entropy_mean": 6.255888938903809, "geo/layer_0/attn_entropy_std": 0.4136907160282135, "geo/layer_7/stable_rank_q_proj": 42.592105865478516, "geo/layer_7/stable_rank_k_proj": 40.069786071777344, "geo/layer_7/stable_rank_o_proj": 88.97468566894531, "geo/layer_7/stable_rank_gate_proj": 79.15750122070312, "geo/layer_7/stable_rank_down_proj": 143.7014923095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4255552887916565, "geo/layer_7/attn_entropy_mean": 4.719786643981934, "geo/layer_7/attn_entropy_std": 0.7689008712768555, "geo/layer_14/stable_rank_q_proj": 50.92924880981445, "geo/layer_14/stable_rank_k_proj": 41.25763702392578, "geo/layer_14/stable_rank_o_proj": 42.958404541015625, "geo/layer_14/stable_rank_gate_proj": 71.38088989257812, "geo/layer_14/stable_rank_down_proj": 126.97445678710938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3755838871002197, "geo/layer_14/attn_entropy_mean": 5.543179512023926, "geo/layer_14/attn_entropy_std": 0.4169371426105499, "geo/layer_21/stable_rank_q_proj": 39.972450256347656, "geo/layer_21/stable_rank_k_proj": 29.812030792236328, "geo/layer_21/stable_rank_o_proj": 67.25137329101562, "geo/layer_21/stable_rank_gate_proj": 63.559871673583984, "geo/layer_21/stable_rank_down_proj": 49.94200897216797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14167442917823792, "geo/layer_21/attn_entropy_mean": 5.884969711303711, "geo/layer_21/attn_entropy_std": 0.3189127743244171, "geo/layer_27/stable_rank_q_proj": 44.011985778808594, "geo/layer_27/stable_rank_k_proj": 31.20071029663086, "geo/layer_27/stable_rank_o_proj": 113.97247314453125, "geo/layer_27/stable_rank_gate_proj": 75.43964385986328, "geo/layer_27/stable_rank_down_proj": 127.56208038330078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09791654348373413, "geo/layer_27/attn_entropy_mean": 4.329069137573242, "geo/layer_27/attn_entropy_std": 0.6324917078018188, "attnres/final_alpha/block_0": 0.24743816256523132, "attnres/block_norm/0": 1.7817966938018799, "attnres/final_alpha/block_1": 0.004076998680830002, "attnres/block_norm/1": 49245.2421875, "attnres/final_alpha/block_2": 0.009024986997246742, "attnres/block_norm/2": 29701.79296875, "attnres/final_alpha/block_3": 0.01065924670547247, "attnres/block_norm/3": 67364.53125, "attnres/final_alpha/block_4": 0.012370672076940536, "attnres/block_norm/4": 16375.5146484375, "attnres/final_alpha/block_5": 0.6132162809371948, "attnres/block_norm/5": 6832.96484375, "attnres/final_alpha/block_6": 0.10321363806724548, "attnres/block_norm/6": 44815.86328125, "geo/tier1_time_s": 1.3579223155975342, "geo/step": 2025.0, "geo/rankme_slope": 0.00285667838416946} {"step": 2030, "timestamp": 1778327917.2212584, "train/loss": 2.400757384300232, "train/z_loss": 0.0013468764605931937, "train/perplexity": 11.031528323787052, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700002.4858520161, "perf/iters_per_sec": 0.8106243542919236, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2336170196533203, "data/tokens_consumed": 4259315712, "data/tokens_consumed_B": 4.259315712, "train/loss_slope": -2.9908864018153918e-05} {"step": 2040, "timestamp": 1778327927.5672348, "train/loss": 2.4320295333862303, "train/z_loss": 0.0013450160156935453, "train/perplexity": 11.381958719897561, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028128.9991245428, "perf/iters_per_sec": 0.9670872684118952, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340328454971313, "data/tokens_consumed": 4280287232, "data/tokens_consumed_B": 4.280287232, "train/loss_slope": -2.6078752079347736e-05} {"step": 2050, "timestamp": 1778327937.9012814, "grad/layer_0/attn": 0.002999846590682864, "grad/layer_0/mlp": 0.003464790992438793, "grad/layer_0/attn_mlp_ratio": 0.8658087921171996, "grad/layer_4/attn": 0.002130151493474841, "grad/layer_4/mlp": 0.002657852368429303, "grad/layer_4/attn_mlp_ratio": 0.8014558816854441, "grad/layer_8/attn": 0.008028322830796242, "grad/layer_8/mlp": 0.0037015529815107584, "grad/layer_8/attn_mlp_ratio": 2.1689065789432225, "grad/layer_12/attn": 0.005837622564285994, "grad/layer_12/mlp": 0.00800042413175106, "grad/layer_12/attn_mlp_ratio": 0.7296641271994533, "grad/layer_16/attn": 0.004391941241919994, "grad/layer_16/mlp": 0.004891968797892332, "grad/layer_16/attn_mlp_ratio": 0.8977860108252588, "grad/layer_20/attn": 0.007836267352104187, "grad/layer_20/mlp": 0.007350534666329622, "grad/layer_20/attn_mlp_ratio": 1.0660812581962806, "grad/layer_24/attn": 0.020682265982031822, "grad/layer_24/mlp": 0.015226075425744057, "grad/layer_24/attn_mlp_ratio": 1.3583451590702087, "grad/layer_27/attn": 0.009271400980651379, "grad/layer_27/mlp": 0.01618213579058647, "grad/layer_27/attn_mlp_ratio": 0.5729404970603893} {"step": 2050, "timestamp": 1778327937.917064, "train/loss": 2.351900506019592, "train/z_loss": 0.0013617740012705326, "train/perplexity": 10.50551656164622, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027352.0976218255, "perf/iters_per_sec": 0.9667168129071357, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034429097175598, "data/tokens_consumed": 4301258752, "data/tokens_consumed_B": 4.301258752, "train/loss_slope": -3.0405191755709785e-05} {"step": 2060, "timestamp": 1778327948.2674906, "train/loss": 2.3970950841903687, "train/z_loss": 0.001367810764349997, "train/perplexity": 10.991201446032782, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027040.1022708807, "perf/iters_per_sec": 0.9665680419306186, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345883131027223, "data/tokens_consumed": 4322230272, "data/tokens_consumed_B": 4.322230272, "train/loss_slope": -2.9906001693309062e-05} {"step": 2070, "timestamp": 1778327958.6157858, "train/loss": 2.3848798990249636, "train/z_loss": 0.0013510182267054915, "train/perplexity": 10.857758559220228, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027640.3511671822, "perf/iters_per_sec": 0.9668542629085456, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342820405960083, "data/tokens_consumed": 4343201792, "data/tokens_consumed_B": 4.343201792, "train/loss_slope": -2.8720910421120308e-05} {"step": 2080, "timestamp": 1778327968.9702444, "train/loss": 2.39858615398407, "train/z_loss": 0.0013605756452307106, "train/perplexity": 11.007602318889843, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026451.4131291974, "perf/iters_per_sec": 0.9662873330732333, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348888635635376, "data/tokens_consumed": 4364173312, "data/tokens_consumed_B": 4.364173312, "train/loss_slope": -2.7894137079494756e-05} {"step": 2090, "timestamp": 1778327979.331576, "train/loss": 2.398626112937927, "train/z_loss": 0.0013549187104217708, "train/perplexity": 11.008042179951119, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025184.2262030418, "perf/iters_per_sec": 0.9656830912604532, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355364084243774, "data/tokens_consumed": 4385144832, "data/tokens_consumed_B": 4.385144832, "train/loss_slope": -2.4932395635765734e-05} {"step": 2100, "timestamp": 1778327989.6965916, "grad/layer_0/attn": 0.0030895713716745377, "grad/layer_0/mlp": 0.0033568362705409527, "grad/layer_0/attn_mlp_ratio": 0.920381880626685, "grad/layer_4/attn": 0.002010201569646597, "grad/layer_4/mlp": 0.0028432002291083336, "grad/layer_4/attn_mlp_ratio": 0.7070207290940429, "grad/layer_8/attn": 0.006385547574609518, "grad/layer_8/mlp": 0.0034974836744368076, "grad/layer_8/attn_mlp_ratio": 1.8257547386728818, "grad/layer_12/attn": 0.008555273525416851, "grad/layer_12/mlp": 0.007993066683411598, "grad/layer_12/attn_mlp_ratio": 1.07033680028447, "grad/layer_16/attn": 0.0038299558218568563, "grad/layer_16/mlp": 0.005457953084260225, "grad/layer_16/attn_mlp_ratio": 0.7017201673517051, "grad/layer_20/attn": 0.0029366095550358295, "grad/layer_20/mlp": 0.0076080490835011005, "grad/layer_20/attn_mlp_ratio": 0.38598719385308305, "grad/layer_24/attn": 0.013333260081708431, "grad/layer_24/mlp": 0.011387336067855358, "grad/layer_24/attn_mlp_ratio": 1.170884909795331, "grad/layer_27/attn": 0.00602448545396328, "grad/layer_27/mlp": 0.011694452725350857, "grad/layer_27/attn_mlp_ratio": 0.5151575318601992} {"step": 2100, "timestamp": 1778327990.302043, "eos/sharpness": 53.709936141967766, "eos/L0_probe": 2.3994228839874268, "eos/L_plus": 2.703697919845581, "eos/L_minus": 2.63224720954895, "eos/grad_norm": 0.197483628988266, "eos/embed_grad_frac": 0.06270212680101395, "eos/time_s": 0.6021347045898438} {"step": 2100, "timestamp": 1778327990.3238804, "train/loss": 2.4339257955551146, "train/z_loss": 0.0013663404737599195, "train/perplexity": 11.40356237424858, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908829.2427741888, "perf/iters_per_sec": 0.9102007116194671, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0986587762832642, "data/tokens_consumed": 4406116352, "data/tokens_consumed_B": 4.406116352, "train/loss_slope": -2.163632188585839e-05} {"step": 2100, "timestamp": 1778327991.6918013, "geo/rankme_last": 423.8571472167969, "geo/layer_0/stable_rank_q_proj": 20.248926162719727, "geo/layer_0/stable_rank_k_proj": 17.033035278320312, "geo/layer_0/stable_rank_o_proj": 46.13290023803711, "geo/layer_0/stable_rank_gate_proj": 129.62277221679688, "geo/layer_0/stable_rank_down_proj": 55.62929916381836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06436986476182938, "geo/layer_0/attn_entropy_mean": 6.256261825561523, "geo/layer_0/attn_entropy_std": 0.41119733452796936, "geo/layer_7/stable_rank_q_proj": 42.50501251220703, "geo/layer_7/stable_rank_k_proj": 39.930294036865234, "geo/layer_7/stable_rank_o_proj": 88.96321105957031, "geo/layer_7/stable_rank_gate_proj": 79.15579223632812, "geo/layer_7/stable_rank_down_proj": 143.40008544921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42494308948516846, "geo/layer_7/attn_entropy_mean": 4.718661308288574, "geo/layer_7/attn_entropy_std": 0.7699006795883179, "geo/layer_14/stable_rank_q_proj": 50.93231964111328, "geo/layer_14/stable_rank_k_proj": 41.197044372558594, "geo/layer_14/stable_rank_o_proj": 42.99956512451172, "geo/layer_14/stable_rank_gate_proj": 71.44893646240234, "geo/layer_14/stable_rank_down_proj": 127.26315307617188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37671709060668945, "geo/layer_14/attn_entropy_mean": 5.553066730499268, "geo/layer_14/attn_entropy_std": 0.42381763458251953, "geo/layer_21/stable_rank_q_proj": 39.96902084350586, "geo/layer_21/stable_rank_k_proj": 29.782793045043945, "geo/layer_21/stable_rank_o_proj": 67.25957489013672, "geo/layer_21/stable_rank_gate_proj": 63.51705551147461, "geo/layer_21/stable_rank_down_proj": 50.001853942871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14096686244010925, "geo/layer_21/attn_entropy_mean": 5.8981218338012695, "geo/layer_21/attn_entropy_std": 0.3147910535335541, "geo/layer_27/stable_rank_q_proj": 43.96261215209961, "geo/layer_27/stable_rank_k_proj": 31.174131393432617, "geo/layer_27/stable_rank_o_proj": 113.86868286132812, "geo/layer_27/stable_rank_gate_proj": 75.32319641113281, "geo/layer_27/stable_rank_down_proj": 127.66040802001953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10464062541723251, "geo/layer_27/attn_entropy_mean": 4.354761123657227, "geo/layer_27/attn_entropy_std": 0.6503329277038574, "attnres/final_alpha/block_0": 0.24837759137153625, "attnres/block_norm/0": 1.7813434600830078, "attnres/final_alpha/block_1": 0.004000340588390827, "attnres/block_norm/1": 49184.86328125, "attnres/final_alpha/block_2": 0.008940663188695908, "attnres/block_norm/2": 29466.06640625, "attnres/final_alpha/block_3": 0.010654350742697716, "attnres/block_norm/3": 66673.0390625, "attnres/final_alpha/block_4": 0.012305602431297302, "attnres/block_norm/4": 16407.9375, "attnres/final_alpha/block_5": 0.6140075922012329, "attnres/block_norm/5": 6842.09814453125, "attnres/final_alpha/block_6": 0.10171383619308472, "attnres/block_norm/6": 44675.85546875, "geo/tier1_time_s": 1.3638026714324951, "geo/step": 2100.0, "geo/rankme_slope": 0.002808297093082923} {"step": 2110, "timestamp": 1778328002.052526, "train/loss": 2.405314087867737, "train/z_loss": 0.0013513445621356368, "train/perplexity": 11.081910429239, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788677.1512368086, "perf/iters_per_sec": 0.8529077297386211, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1724597692489624, "data/tokens_consumed": 4427087872, "data/tokens_consumed_B": 4.427087872, "train/loss_slope": -2.265458257213363e-05} {"step": 2120, "timestamp": 1778328012.4108875, "train/loss": 2.4120164632797243, "train/z_loss": 0.0013559440732933581, "train/perplexity": 11.156435020128985, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026015.7437927034, "perf/iters_per_sec": 0.9660795897449033, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035111403465271, "data/tokens_consumed": 4448059392, "data/tokens_consumed_B": 4.448059392, "train/loss_slope": -2.1074967494498838e-05} {"step": 2130, "timestamp": 1778328023.170939, "train/loss": 2.371604347229004, "train/z_loss": 0.0013462434872053564, "train/perplexity": 10.714568390314655, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950031.2702714314, "perf/iters_per_sec": 0.9298473693234593, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0754453182220458, "data/tokens_consumed": 4469030912, "data/tokens_consumed_B": 4.469030912, "train/loss_slope": -2.3505621002201908e-05} {"step": 2140, "timestamp": 1778328033.523413, "train/loss": 2.4007326126098634, "train/z_loss": 0.001344254706054926, "train/perplexity": 11.031255057567769, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027352.6116197237, "perf/iters_per_sec": 0.9667170580004328, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344288349151611, "data/tokens_consumed": 4490002432, "data/tokens_consumed_B": 4.490002432, "train/loss_slope": -2.3485116746881357e-05} {"step": 2150, "timestamp": 1778328043.8657084, "grad/layer_0/attn": 0.0032087662257254124, "grad/layer_0/mlp": 0.003183892462402582, "grad/layer_0/attn_mlp_ratio": 1.0078123438009667, "grad/layer_4/attn": 0.0019001991022378206, "grad/layer_4/mlp": 0.0026715167332440615, "grad/layer_4/attn_mlp_ratio": 0.7112809766316843, "grad/layer_8/attn": 0.003229371504858136, "grad/layer_8/mlp": 0.003522151615470648, "grad/layer_8/attn_mlp_ratio": 0.9168746169205295, "grad/layer_12/attn": 0.006085844244807959, "grad/layer_12/mlp": 0.007571469992399216, "grad/layer_12/attn_mlp_ratio": 0.8037863414289077, "grad/layer_16/attn": 0.005358721129596233, "grad/layer_16/mlp": 0.004512133542448282, "grad/layer_16/attn_mlp_ratio": 1.1876246481672457, "grad/layer_20/attn": 0.00368501921184361, "grad/layer_20/mlp": 0.006049013696610928, "grad/layer_20/attn_mlp_ratio": 0.6091933885004875, "grad/layer_24/attn": 0.014049637131392956, "grad/layer_24/mlp": 0.011215333826839924, "grad/layer_24/attn_mlp_ratio": 1.2527167914073547, "grad/layer_27/attn": 0.006721964105963707, "grad/layer_27/mlp": 0.010791126638650894, "grad/layer_27/attn_mlp_ratio": 0.6229158704889872} {"step": 2150, "timestamp": 1778328043.880933, "train/loss": 2.3708128452301027, "train/z_loss": 0.0013616331620141865, "train/perplexity": 10.706091143337888, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025834.0913336582, "perf/iters_per_sec": 0.9659929711025516, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352042198181153, "data/tokens_consumed": 4510973952, "data/tokens_consumed_B": 4.510973952, "train/loss_slope": -2.6244657432357047e-05} {"step": 2160, "timestamp": 1778328054.231626, "train/loss": 2.4294723749160765, "train/z_loss": 0.0013542226632125675, "train/perplexity": 11.352890429702216, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027561.8304061121, "perf/iters_per_sec": 0.966816821291977, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343220949172973, "data/tokens_consumed": 4531945472, "data/tokens_consumed_B": 4.531945472, "train/loss_slope": -2.2690990841714227e-05} {"step": 2170, "timestamp": 1778328064.5877502, "train/loss": 2.361757493019104, "train/z_loss": 0.0013608809560537339, "train/perplexity": 10.609581341832081, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026452.440212049, "perf/iters_per_sec": 0.9662878228245015, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348883390426635, "data/tokens_consumed": 4552916992, "data/tokens_consumed_B": 4.552916992, "train/loss_slope": -2.291717343311789e-05} {"step": 2175, "timestamp": 1778328070.3467643, "eos/sharpness": 46.51300907135009, "eos/L0_probe": 2.3935914039611816, "eos/L_plus": 2.62949800491333, "eos/L_minus": 2.622814893722534, "eos/grad_norm": 0.17665942013263702, "eos/embed_grad_frac": 0.08560987561941147, "eos/time_s": 0.5962259769439697} {"step": 2175, "timestamp": 1778328071.7299423, "geo/rankme_last": 423.9319763183594, "geo/layer_0/stable_rank_q_proj": 20.236268997192383, "geo/layer_0/stable_rank_k_proj": 17.04160499572754, "geo/layer_0/stable_rank_o_proj": 46.09661102294922, "geo/layer_0/stable_rank_gate_proj": 129.53170776367188, "geo/layer_0/stable_rank_down_proj": 55.59700393676758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06487856060266495, "geo/layer_0/attn_entropy_mean": 6.257501125335693, "geo/layer_0/attn_entropy_std": 0.40882807970046997, "geo/layer_7/stable_rank_q_proj": 42.55122756958008, "geo/layer_7/stable_rank_k_proj": 39.86003494262695, "geo/layer_7/stable_rank_o_proj": 88.87899017333984, "geo/layer_7/stable_rank_gate_proj": 79.21056365966797, "geo/layer_7/stable_rank_down_proj": 143.56924438476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4413328170776367, "geo/layer_7/attn_entropy_mean": 4.733664512634277, "geo/layer_7/attn_entropy_std": 0.7747302055358887, "geo/layer_14/stable_rank_q_proj": 50.924896240234375, "geo/layer_14/stable_rank_k_proj": 41.29972457885742, "geo/layer_14/stable_rank_o_proj": 42.90127182006836, "geo/layer_14/stable_rank_gate_proj": 71.41710662841797, "geo/layer_14/stable_rank_down_proj": 127.27410125732422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3922382593154907, "geo/layer_14/attn_entropy_mean": 5.564845085144043, "geo/layer_14/attn_entropy_std": 0.41025063395500183, "geo/layer_21/stable_rank_q_proj": 39.96162796020508, "geo/layer_21/stable_rank_k_proj": 29.75632667541504, "geo/layer_21/stable_rank_o_proj": 67.27861022949219, "geo/layer_21/stable_rank_gate_proj": 63.4620475769043, "geo/layer_21/stable_rank_down_proj": 49.99874496459961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14521215856075287, "geo/layer_21/attn_entropy_mean": 5.90287446975708, "geo/layer_21/attn_entropy_std": 0.3168512284755707, "geo/layer_27/stable_rank_q_proj": 43.94272232055664, "geo/layer_27/stable_rank_k_proj": 31.186260223388672, "geo/layer_27/stable_rank_o_proj": 113.77881622314453, "geo/layer_27/stable_rank_gate_proj": 75.32414245605469, "geo/layer_27/stable_rank_down_proj": 127.91251373291016, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09868407994508743, "geo/layer_27/attn_entropy_mean": 4.324930191040039, "geo/layer_27/attn_entropy_std": 0.6410542130470276, "attnres/final_alpha/block_0": 0.24800480902194977, "attnres/block_norm/0": 1.78115713596344, "attnres/final_alpha/block_1": 0.0040464382618665695, "attnres/block_norm/1": 49109.71875, "attnres/final_alpha/block_2": 0.008869694545865059, "attnres/block_norm/2": 29456.94921875, "attnres/final_alpha/block_3": 0.010533452033996582, "attnres/block_norm/3": 67184.953125, "attnres/final_alpha/block_4": 0.012280376628041267, "attnres/block_norm/4": 16432.98046875, "attnres/final_alpha/block_5": 0.612075686454773, "attnres/block_norm/5": 6883.25341796875, "attnres/final_alpha/block_6": 0.10418953746557236, "attnres/block_norm/6": 44602.33984375, "geo/tier1_time_s": 1.3630445003509521, "geo/step": 2175.0, "geo/rankme_slope": 0.0027572866516905356} {"step": 2180, "timestamp": 1778328076.9103997, "train/loss": 2.396596646308899, "train/z_loss": 0.0013476628344506025, "train/perplexity": 10.985724379971211, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702538.4436911496, "perf/iters_per_sec": 0.8118335932212589, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2317795276641845, "data/tokens_consumed": 4573888512, "data/tokens_consumed_B": 4.573888512, "train/loss_slope": -1.9773575367599954e-05} {"step": 2190, "timestamp": 1778328087.2614455, "train/loss": 2.3942779541015624, "train/z_loss": 0.001363737601786852, "train/perplexity": 10.960281375107153, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027409.4333579636, "perf/iters_per_sec": 0.9667441527166193, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343998432159425, "data/tokens_consumed": 4594860032, "data/tokens_consumed_B": 4.594860032, "train/loss_slope": -1.8035030508055723e-05} {"step": 2200, "timestamp": 1778328097.5983582, "grad/layer_0/attn": 0.002907285699620843, "grad/layer_0/mlp": 0.003146390663459897, "grad/layer_0/attn_mlp_ratio": 0.9240065580487133, "grad/layer_4/attn": 0.0020137077663093805, "grad/layer_4/mlp": 0.0026168501935899258, "grad/layer_4/attn_mlp_ratio": 0.7695158455346239, "grad/layer_8/attn": 0.003915534354746342, "grad/layer_8/mlp": 0.003327655140310526, "grad/layer_8/attn_mlp_ratio": 1.1766646698595364, "grad/layer_12/attn": 0.005851376801729202, "grad/layer_12/mlp": 0.006972252391278744, "grad/layer_12/attn_mlp_ratio": 0.8392376508235193, "grad/layer_16/attn": 0.0035475485492497683, "grad/layer_16/mlp": 0.004599128849804401, "grad/layer_16/attn_mlp_ratio": 0.7713522686509244, "grad/layer_20/attn": 0.004214311949908733, "grad/layer_20/mlp": 0.005780295003205538, "grad/layer_20/attn_mlp_ratio": 0.729082490541293, "grad/layer_24/attn": 0.007860975340008736, "grad/layer_24/mlp": 0.009559404104948044, "grad/layer_24/attn_mlp_ratio": 0.8223290041381257, "grad/layer_27/attn": 0.006973435170948505, "grad/layer_27/mlp": 0.008548282086849213, "grad/layer_27/attn_mlp_ratio": 0.8157703522792601} {"step": 2200, "timestamp": 1778328097.6140094, "train/loss": 2.385213851928711, "train/z_loss": 0.0013683043303899466, "train/perplexity": 10.861385144739947, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027186.6505704157, "perf/iters_per_sec": 0.9666379216053084, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034513521194458, "data/tokens_consumed": 4615831552, "data/tokens_consumed_B": 4.615831552, "train/loss_slope": -1.848111255656057e-05} {"step": 2210, "timestamp": 1778328107.9689326, "train/loss": 2.4029191970825194, "train/z_loss": 0.0013400580384768546, "train/perplexity": 11.055402218873605, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026348.476697354, "perf/iters_per_sec": 0.9662382491575976, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349414348602295, "data/tokens_consumed": 4636803072, "data/tokens_consumed_B": 4.636803072, "train/loss_slope": -1.9077479771368908e-05} {"step": 2220, "timestamp": 1778328118.3246956, "train/loss": 2.4211573362350465, "train/z_loss": 0.0013450881582684816, "train/perplexity": 11.258882089772236, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026641.067623961, "perf/iters_per_sec": 0.9663777673835569, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347920179367065, "data/tokens_consumed": 4657774592, "data/tokens_consumed_B": 4.657774592, "train/loss_slope": -1.9182769436039412e-05} {"step": 2230, "timestamp": 1778328128.6701305, "train/loss": 2.3909103870391846, "train/z_loss": 0.0013516847975552082, "train/perplexity": 10.923433970429462, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028270.9344257903, "perf/iters_per_sec": 0.9671549484375908, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033960485458374, "data/tokens_consumed": 4678746112, "data/tokens_consumed_B": 4.678746112, "train/loss_slope": -2.0593762133095058e-05} {"step": 2240, "timestamp": 1778328139.0210145, "train/loss": 2.4019981622695923, "train/z_loss": 0.001348260510712862, "train/perplexity": 11.045224496296997, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027100.9707323455, "perf/iters_per_sec": 0.9665970662748077, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345572471618651, "data/tokens_consumed": 4699717632, "data/tokens_consumed_B": 4.699717632, "train/loss_slope": -2.1572002633498538e-05} {"step": 2250, "timestamp": 1778328149.360257, "grad/layer_0/attn": 0.003432986093685031, "grad/layer_0/mlp": 0.0033108298666775227, "grad/layer_0/attn_mlp_ratio": 1.0368959228462264, "grad/layer_4/attn": 0.0021734111942350864, "grad/layer_4/mlp": 0.0027796148788183928, "grad/layer_4/attn_mlp_ratio": 0.7819108800309451, "grad/layer_8/attn": 0.004073006100952625, "grad/layer_8/mlp": 0.003705750685185194, "grad/layer_8/attn_mlp_ratio": 1.0991041591991668, "grad/layer_12/attn": 0.0057005626149475574, "grad/layer_12/mlp": 0.006353935692459345, "grad/layer_12/attn_mlp_ratio": 0.8971703210650649, "grad/layer_16/attn": 0.004139365162700415, "grad/layer_16/mlp": 0.005138882901519537, "grad/layer_16/attn_mlp_ratio": 0.8054990085348177, "grad/layer_20/attn": 0.00429519172757864, "grad/layer_20/mlp": 0.00667586037889123, "grad/layer_20/attn_mlp_ratio": 0.6433914760741096, "grad/layer_24/attn": 0.012389821000397205, "grad/layer_24/mlp": 0.010058669373393059, "grad/layer_24/attn_mlp_ratio": 1.2317554556465395, "grad/layer_27/attn": 0.008476871997117996, "grad/layer_27/mlp": 0.009833508171141148, "grad/layer_27/attn_mlp_ratio": 0.862039443440086} {"step": 2250, "timestamp": 1778328149.9596043, "eos/sharpness": 55.77280521392821, "eos/L0_probe": 2.3964877128601074, "eos/L_plus": 2.7317352294921875, "eos/L_minus": 2.6189682483673096, "eos/grad_norm": 0.16699646413326263, "eos/embed_grad_frac": 0.08479867875576019, "eos/time_s": 0.5963623523712158} {"step": 2250, "timestamp": 1778328149.980757, "train/loss": 2.393096661567688, "train/z_loss": 0.0013475525425747036, "train/perplexity": 10.947341720812803, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914789.2157860815, "perf/iters_per_sec": 0.9130426482134254, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0952390909194947, "data/tokens_consumed": 4720689152, "data/tokens_consumed_B": 4.720689152, "train/loss_slope": -1.8758477701140796e-05} {"step": 2250, "timestamp": 1778328151.3448148, "geo/rankme_last": 423.77593994140625, "geo/layer_0/stable_rank_q_proj": 20.23538589477539, "geo/layer_0/stable_rank_k_proj": 17.041505813598633, "geo/layer_0/stable_rank_o_proj": 46.06818389892578, "geo/layer_0/stable_rank_gate_proj": 129.74134826660156, "geo/layer_0/stable_rank_down_proj": 55.65164566040039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06489887088537216, "geo/layer_0/attn_entropy_mean": 6.253391265869141, "geo/layer_0/attn_entropy_std": 0.41111403703689575, "geo/layer_7/stable_rank_q_proj": 42.55546569824219, "geo/layer_7/stable_rank_k_proj": 39.75666427612305, "geo/layer_7/stable_rank_o_proj": 88.8738784790039, "geo/layer_7/stable_rank_gate_proj": 79.307373046875, "geo/layer_7/stable_rank_down_proj": 143.40371704101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4169238805770874, "geo/layer_7/attn_entropy_mean": 4.718939781188965, "geo/layer_7/attn_entropy_std": 0.7790606617927551, "geo/layer_14/stable_rank_q_proj": 50.84926986694336, "geo/layer_14/stable_rank_k_proj": 41.44032287597656, "geo/layer_14/stable_rank_o_proj": 42.90827178955078, "geo/layer_14/stable_rank_gate_proj": 71.4931411743164, "geo/layer_14/stable_rank_down_proj": 126.93079376220703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3797893524169922, "geo/layer_14/attn_entropy_mean": 5.507724285125732, "geo/layer_14/attn_entropy_std": 0.4276416599750519, "geo/layer_21/stable_rank_q_proj": 39.975303649902344, "geo/layer_21/stable_rank_k_proj": 29.744091033935547, "geo/layer_21/stable_rank_o_proj": 67.1575698852539, "geo/layer_21/stable_rank_gate_proj": 63.48231506347656, "geo/layer_21/stable_rank_down_proj": 50.02419662475586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1419878900051117, "geo/layer_21/attn_entropy_mean": 5.8930344581604, "geo/layer_21/attn_entropy_std": 0.3100285828113556, "geo/layer_27/stable_rank_q_proj": 43.884124755859375, "geo/layer_27/stable_rank_k_proj": 31.148218154907227, "geo/layer_27/stable_rank_o_proj": 113.50357818603516, "geo/layer_27/stable_rank_gate_proj": 75.16447448730469, "geo/layer_27/stable_rank_down_proj": 127.73299407958984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10725785791873932, "geo/layer_27/attn_entropy_mean": 4.322071552276611, "geo/layer_27/attn_entropy_std": 0.6560929417610168, "attnres/final_alpha/block_0": 0.24850618839263916, "attnres/block_norm/0": 1.7810313701629639, "attnres/final_alpha/block_1": 0.004027861636132002, "attnres/block_norm/1": 49137.2734375, "attnres/final_alpha/block_2": 0.008893347345292568, "attnres/block_norm/2": 29526.4453125, "attnres/final_alpha/block_3": 0.010664287954568863, "attnres/block_norm/3": 67567.140625, "attnres/final_alpha/block_4": 0.012279339134693146, "attnres/block_norm/4": 16449.66796875, "attnres/final_alpha/block_5": 0.6135768294334412, "attnres/block_norm/5": 6884.763671875, "attnres/final_alpha/block_6": 0.10205215215682983, "attnres/block_norm/6": 44897.78125, "geo/tier1_time_s": 1.3598358631134033, "geo/step": 2250.0, "geo/rankme_slope": 0.0026862714213709677} {"step": 2260, "timestamp": 1778328162.1565452, "train/loss": 2.425506353378296, "train/z_loss": 0.0013451822916977108, "train/perplexity": 11.30795379048335, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1723000.277116473, "perf/iters_per_sec": 0.8215905557234159, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2171512842178345, "data/tokens_consumed": 4741660672, "data/tokens_consumed_B": 4.741660672, "train/loss_slope": -1.655682136874425e-05} {"step": 2270, "timestamp": 1778328172.5043895, "train/loss": 2.3590482234954835, "train/z_loss": 0.0013671522145159542, "train/perplexity": 10.580876029217016, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027701.1621970423, "perf/iters_per_sec": 0.9668832598672115, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342510223388672, "data/tokens_consumed": 4762632192, "data/tokens_consumed_B": 4.762632192, "train/loss_slope": -1.8508693072447506e-05} {"step": 2280, "timestamp": 1778328182.8515465, "train/loss": 2.3827693462371826, "train/z_loss": 0.0013518092455342412, "train/perplexity": 10.834866852199513, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028038.0026250319, "perf/iters_per_sec": 0.967043877899662, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340792417526246, "data/tokens_consumed": 4783603712, "data/tokens_consumed_B": 4.783603712, "train/loss_slope": -1.9293299602596675e-05} {"step": 2290, "timestamp": 1778328193.19666, "train/loss": 2.39498918056488, "train/z_loss": 0.0013507761992514133, "train/perplexity": 10.968079390015104, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028298.1546020943, "perf/iters_per_sec": 0.9671679280291053, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339466094970704, "data/tokens_consumed": 4804575232, "data/tokens_consumed_B": 4.804575232, "train/loss_slope": -2.2330439201604493e-05} {"step": 2300, "timestamp": 1778328203.5341208, "grad/layer_0/attn": 0.0026842853985726833, "grad/layer_0/mlp": 0.003041653661057353, "grad/layer_0/attn_mlp_ratio": 0.8825085330026388, "grad/layer_4/attn": 0.002229877980425954, "grad/layer_4/mlp": 0.0025368963833898306, "grad/layer_4/attn_mlp_ratio": 0.878978702925381, "grad/layer_8/attn": 0.0033356621861457825, "grad/layer_8/mlp": 0.00325977080501616, "grad/layer_8/attn_mlp_ratio": 1.0232811701622468, "grad/layer_12/attn": 0.00773636344820261, "grad/layer_12/mlp": 0.006930467672646046, "grad/layer_12/attn_mlp_ratio": 1.1162830132097814, "grad/layer_16/attn": 0.00362344179302454, "grad/layer_16/mlp": 0.004474789369851351, "grad/layer_16/attn_mlp_ratio": 0.8097457584177493, "grad/layer_20/attn": 0.002996265422552824, "grad/layer_20/mlp": 0.005688192788511515, "grad/layer_20/attn_mlp_ratio": 0.5267517261245489, "grad/layer_24/attn": 0.008456515148282051, "grad/layer_24/mlp": 0.00886695645749569, "grad/layer_24/attn_mlp_ratio": 0.953711128891607, "grad/layer_27/attn": 0.008480263873934746, "grad/layer_27/mlp": 0.007951904088258743, "grad/layer_27/attn_mlp_ratio": 1.0664444230170862} {"step": 2300, "timestamp": 1778328203.5496724, "train/loss": 2.3793892860412598, "train/z_loss": 0.0013529416639357806, "train/perplexity": 10.798306173479926, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026573.8767547384, "perf/iters_per_sec": 0.9663457282804195, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348263263702393, "data/tokens_consumed": 4825546752, "data/tokens_consumed_B": 4.825546752, "train/loss_slope": -2.5830579254672306e-05} {"step": 2310, "timestamp": 1778328213.9037287, "train/loss": 2.405556344985962, "train/z_loss": 0.0013528119772672654, "train/perplexity": 11.084595426140691, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026926.5967928213, "perf/iters_per_sec": 0.9665139183010203, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034646248817444, "data/tokens_consumed": 4846518272, "data/tokens_consumed_B": 4.846518272, "train/loss_slope": -2.6842487133768942e-05} {"step": 2320, "timestamp": 1778328224.7467504, "train/loss": 2.422201132774353, "train/z_loss": 0.001346436992753297, "train/perplexity": 11.2706402074077, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935055.341801274, "perf/iters_per_sec": 0.9227062901502963, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0837684869766235, "data/tokens_consumed": 4867489792, "data/tokens_consumed_B": 4.867489792, "train/loss_slope": -2.412132666055914e-05} {"step": 2325, "timestamp": 1778328230.5090094, "eos/sharpness": 8.687901496887205, "eos/L0_probe": 2.39736270904541, "eos/L_plus": 2.442570447921753, "eos/L_minus": 2.4390339851379395, "eos/grad_norm": 0.11050863564014435, "eos/embed_grad_frac": 0.2662799656391144, "eos/time_s": 0.5930593013763428} {"step": 2325, "timestamp": 1778328231.884691, "geo/rankme_last": 424.2142639160156, "geo/layer_0/stable_rank_q_proj": 20.24764060974121, "geo/layer_0/stable_rank_k_proj": 17.05864906311035, "geo/layer_0/stable_rank_o_proj": 46.0378532409668, "geo/layer_0/stable_rank_gate_proj": 129.6171417236328, "geo/layer_0/stable_rank_down_proj": 55.631080627441406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06913790106773376, "geo/layer_0/attn_entropy_mean": 6.2597808837890625, "geo/layer_0/attn_entropy_std": 0.4102654457092285, "geo/layer_7/stable_rank_q_proj": 42.61616134643555, "geo/layer_7/stable_rank_k_proj": 39.75422668457031, "geo/layer_7/stable_rank_o_proj": 88.96492767333984, "geo/layer_7/stable_rank_gate_proj": 79.18180084228516, "geo/layer_7/stable_rank_down_proj": 143.30897521972656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42447778582572937, "geo/layer_7/attn_entropy_mean": 4.7098798751831055, "geo/layer_7/attn_entropy_std": 0.7637867331504822, "geo/layer_14/stable_rank_q_proj": 50.894081115722656, "geo/layer_14/stable_rank_k_proj": 41.396766662597656, "geo/layer_14/stable_rank_o_proj": 42.95598602294922, "geo/layer_14/stable_rank_gate_proj": 71.5340576171875, "geo/layer_14/stable_rank_down_proj": 127.04324340820312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37732744216918945, "geo/layer_14/attn_entropy_mean": 5.4940643310546875, "geo/layer_14/attn_entropy_std": 0.43610745668411255, "geo/layer_21/stable_rank_q_proj": 39.94571304321289, "geo/layer_21/stable_rank_k_proj": 29.675247192382812, "geo/layer_21/stable_rank_o_proj": 67.1545639038086, "geo/layer_21/stable_rank_gate_proj": 63.445587158203125, "geo/layer_21/stable_rank_down_proj": 50.03829574584961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14060071110725403, "geo/layer_21/attn_entropy_mean": 5.88982629776001, "geo/layer_21/attn_entropy_std": 0.3183498680591583, "geo/layer_27/stable_rank_q_proj": 43.819297790527344, "geo/layer_27/stable_rank_k_proj": 31.18482208251953, "geo/layer_27/stable_rank_o_proj": 113.28607177734375, "geo/layer_27/stable_rank_gate_proj": 75.11112213134766, "geo/layer_27/stable_rank_down_proj": 127.53109741210938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10681360960006714, "geo/layer_27/attn_entropy_mean": 4.336491107940674, "geo/layer_27/attn_entropy_std": 0.6564626097679138, "attnres/final_alpha/block_0": 0.24976755678653717, "attnres/block_norm/0": 1.7807742357254028, "attnres/final_alpha/block_1": 0.004030722193419933, "attnres/block_norm/1": 49194.14453125, "attnres/final_alpha/block_2": 0.00891672819852829, "attnres/block_norm/2": 29525.0703125, "attnres/final_alpha/block_3": 0.010707061737775803, "attnres/block_norm/3": 66920.28125, "attnres/final_alpha/block_4": 0.012371175922453403, "attnres/block_norm/4": 16455.42578125, "attnres/final_alpha/block_5": 0.6101118326187134, "attnres/block_norm/5": 6867.38623046875, "attnres/final_alpha/block_6": 0.10409495234489441, "attnres/block_norm/6": 44668.7734375, "geo/tier1_time_s": 1.3553671836853027, "geo/step": 2325.0, "geo/rankme_slope": 0.0026457080113922744} {"step": 2330, "timestamp": 1778328237.0595856, "train/loss": 2.3795103788375855, "train/z_loss": 0.0013639318058267236, "train/perplexity": 10.799613849743544, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704094.355731088, "perf/iters_per_sec": 0.8125755098967972, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2306548595428466, "data/tokens_consumed": 4888461312, "data/tokens_consumed_B": 4.888461312, "train/loss_slope": -2.4246550979751622e-05} {"step": 2340, "timestamp": 1778328247.4132626, "train/loss": 2.3942492961883546, "train/z_loss": 0.0013480760855600238, "train/perplexity": 10.959967280815436, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026560.149633942, "perf/iters_per_sec": 0.9663391826791486, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348333358764648, "data/tokens_consumed": 4909432832, "data/tokens_consumed_B": 4.909432832, "train/loss_slope": -2.2064598852043733e-05} {"step": 2350, "timestamp": 1778328257.746637, "grad/layer_0/attn": 0.0031852927058935165, "grad/layer_0/mlp": 0.0035877320915460587, "grad/layer_0/attn_mlp_ratio": 0.8878290061335042, "grad/layer_4/attn": 0.0018207294633612037, "grad/layer_4/mlp": 0.002625636290758848, "grad/layer_4/attn_mlp_ratio": 0.6934431095521896, "grad/layer_8/attn": 0.004582044668495655, "grad/layer_8/mlp": 0.0036451073829084635, "grad/layer_8/attn_mlp_ratio": 1.2570396593187976, "grad/layer_12/attn": 0.0060598114505410194, "grad/layer_12/mlp": 0.0066400254145264626, "grad/layer_12/attn_mlp_ratio": 0.9126186996245569, "grad/layer_16/attn": 0.0047614253126084805, "grad/layer_16/mlp": 0.004478916991502047, "grad/layer_16/attn_mlp_ratio": 1.063075117340851, "grad/layer_20/attn": 0.003273274516686797, "grad/layer_20/mlp": 0.005924512632191181, "grad/layer_20/attn_mlp_ratio": 0.5524968321700571, "grad/layer_24/attn": 0.008400147780776024, "grad/layer_24/mlp": 0.008563431911170483, "grad/layer_24/attn_mlp_ratio": 0.9809323843312513, "grad/layer_27/attn": 0.004378214944154024, "grad/layer_27/mlp": 0.007397498935461044, "grad/layer_27/attn_mlp_ratio": 0.5918506948316425} {"step": 2350, "timestamp": 1778328257.7623003, "train/loss": 2.3745399713516235, "train/z_loss": 0.0013652005465701223, "train/perplexity": 10.746068549436144, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027355.9292487921, "perf/iters_per_sec": 0.9667186399692498, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344271421432496, "data/tokens_consumed": 4930404352, "data/tokens_consumed_B": 4.930404352, "train/loss_slope": -2.4607282438830455e-05} {"step": 2360, "timestamp": 1778328268.1263456, "train/loss": 2.430197501182556, "train/z_loss": 0.0013425962184555828, "train/perplexity": 11.361125694195707, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024998.9000771684, "perf/iters_per_sec": 0.9655947208772508, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356311798095703, "data/tokens_consumed": 4951375872, "data/tokens_consumed_B": 4.951375872, "train/loss_slope": -2.1273319277003728e-05} {"step": 2370, "timestamp": 1778328278.4797583, "train/loss": 2.358388376235962, "train/z_loss": 0.0013681300450116396, "train/perplexity": 10.573896570107523, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026977.2289417302, "perf/iters_per_sec": 0.9665380615910197, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346204042434692, "data/tokens_consumed": 4972347392, "data/tokens_consumed_B": 4.972347392, "train/loss_slope": -2.190927354702179e-05} {"step": 2380, "timestamp": 1778328288.840596, "train/loss": 2.386816930770874, "train/z_loss": 0.0013570739538408816, "train/perplexity": 10.878810765051895, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025071.5345519457, "perf/iters_per_sec": 0.9656293556937913, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355940341949463, "data/tokens_consumed": 4993318912, "data/tokens_consumed_B": 4.993318912, "train/loss_slope": -2.3314597361301697e-05} {"step": 2390, "timestamp": 1778328299.1939554, "train/loss": 2.4146070957183836, "train/z_loss": 0.0013538561062887311, "train/perplexity": 11.185374712459355, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026361.967543489, "perf/iters_per_sec": 0.9662446820943303, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349345445632934, "data/tokens_consumed": 5014290432, "data/tokens_consumed_B": 5.014290432, "train/loss_slope": -2.248383333759556e-05} {"step": 2400, "timestamp": 1778328309.5334892, "grad/layer_0/attn": 0.002391211921349168, "grad/layer_0/mlp": 0.0028235996142029762, "grad/layer_0/attn_mlp_ratio": 0.846866469535729, "grad/layer_4/attn": 0.0019760087598115206, "grad/layer_4/mlp": 0.0026288400404155254, "grad/layer_4/attn_mlp_ratio": 0.7516656222006655, "grad/layer_8/attn": 0.003604706609621644, "grad/layer_8/mlp": 0.003562930040061474, "grad/layer_8/attn_mlp_ratio": 1.011725312570807, "grad/layer_12/attn": 0.011131569743156433, "grad/layer_12/mlp": 0.007401547394692898, "grad/layer_12/attn_mlp_ratio": 1.5039516737733643, "grad/layer_16/attn": 0.007725783623754978, "grad/layer_16/mlp": 0.004225858952850103, "grad/layer_16/attn_mlp_ratio": 1.8282161158556327, "grad/layer_20/attn": 0.003614610293880105, "grad/layer_20/mlp": 0.005824392661452293, "grad/layer_20/attn_mlp_ratio": 0.620598651554332, "grad/layer_24/attn": 0.010247493162751198, "grad/layer_24/mlp": 0.010029159486293793, "grad/layer_24/attn_mlp_ratio": 1.0217698775833408, "grad/layer_27/attn": 0.010348711162805557, "grad/layer_27/mlp": 0.009854987263679504, "grad/layer_27/attn_mlp_ratio": 1.050098876934705} {"step": 2400, "timestamp": 1778328310.1259992, "eos/sharpness": 52.02469825744628, "eos/L0_probe": 2.3925509452819824, "eos/L_plus": 2.7009313106536865, "eos/L_minus": 2.604417562484741, "eos/grad_norm": 0.16161400079727173, "eos/embed_grad_frac": 0.09120568633079529, "eos/time_s": 0.589759349822998} {"step": 2400, "timestamp": 1778328310.1455834, "train/loss": 2.3533233404159546, "train/z_loss": 0.0013380352989770471, "train/perplexity": 10.520474810993997, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916129.1981558998, "perf/iters_per_sec": 0.9136816015986918, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.094473171234131, "data/tokens_consumed": 5035261952, "data/tokens_consumed_B": 5.035261952, "train/loss_slope": -2.3985305434764078e-05} {"step": 2400, "timestamp": 1778328311.5114768, "geo/rankme_last": 424.2814025878906, "geo/layer_0/stable_rank_q_proj": 20.269227981567383, "geo/layer_0/stable_rank_k_proj": 17.06364631652832, "geo/layer_0/stable_rank_o_proj": 45.94984436035156, "geo/layer_0/stable_rank_gate_proj": 129.78863525390625, "geo/layer_0/stable_rank_down_proj": 55.653358459472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06719887256622314, "geo/layer_0/attn_entropy_mean": 6.25544548034668, "geo/layer_0/attn_entropy_std": 0.41379404067993164, "geo/layer_7/stable_rank_q_proj": 42.6092643737793, "geo/layer_7/stable_rank_k_proj": 39.685829162597656, "geo/layer_7/stable_rank_o_proj": 89.07536315917969, "geo/layer_7/stable_rank_gate_proj": 79.18476867675781, "geo/layer_7/stable_rank_down_proj": 143.30386352539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4215128421783447, "geo/layer_7/attn_entropy_mean": 4.731573104858398, "geo/layer_7/attn_entropy_std": 0.761094331741333, "geo/layer_14/stable_rank_q_proj": 50.812870025634766, "geo/layer_14/stable_rank_k_proj": 41.41036605834961, "geo/layer_14/stable_rank_o_proj": 42.914405822753906, "geo/layer_14/stable_rank_gate_proj": 71.4632797241211, "geo/layer_14/stable_rank_down_proj": 127.2665786743164, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3808364272117615, "geo/layer_14/attn_entropy_mean": 5.515851974487305, "geo/layer_14/attn_entropy_std": 0.41467973589897156, "geo/layer_21/stable_rank_q_proj": 39.95652770996094, "geo/layer_21/stable_rank_k_proj": 29.632686614990234, "geo/layer_21/stable_rank_o_proj": 67.14924621582031, "geo/layer_21/stable_rank_gate_proj": 63.51901626586914, "geo/layer_21/stable_rank_down_proj": 50.01002502441406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13823017477989197, "geo/layer_21/attn_entropy_mean": 5.891812324523926, "geo/layer_21/attn_entropy_std": 0.3170314431190491, "geo/layer_27/stable_rank_q_proj": 43.80049133300781, "geo/layer_27/stable_rank_k_proj": 31.12742805480957, "geo/layer_27/stable_rank_o_proj": 113.06253051757812, "geo/layer_27/stable_rank_gate_proj": 75.12297821044922, "geo/layer_27/stable_rank_down_proj": 127.6615982055664, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09975804388523102, "geo/layer_27/attn_entropy_mean": 4.339549541473389, "geo/layer_27/attn_entropy_std": 0.6502915620803833, "attnres/final_alpha/block_0": 0.24807247519493103, "attnres/block_norm/0": 1.780456304550171, "attnres/final_alpha/block_1": 0.004052416421473026, "attnres/block_norm/1": 49117.3203125, "attnres/final_alpha/block_2": 0.00880996510386467, "attnres/block_norm/2": 29582.5078125, "attnres/final_alpha/block_3": 0.01053192000836134, "attnres/block_norm/3": 66826.953125, "attnres/final_alpha/block_4": 0.012212282046675682, "attnres/block_norm/4": 16480.201171875, "attnres/final_alpha/block_5": 0.6129575371742249, "attnres/block_norm/5": 6888.5029296875, "attnres/final_alpha/block_6": 0.10336337983608246, "attnres/block_norm/6": 44875.5703125, "geo/tier1_time_s": 1.3617243766784668, "geo/step": 2400.0, "geo/rankme_slope": 0.002602724833496964} {"step": 2410, "timestamp": 1778328322.2520278, "train/loss": 2.376794672012329, "train/z_loss": 0.0013489926001057028, "train/perplexity": 10.77032505259538, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1732853.9933879592, "perf/iters_per_sec": 0.8262891737880512, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2102300643920898, "data/tokens_consumed": 5056233472, "data/tokens_consumed_B": 5.056233472, "train/loss_slope": -2.4151922767311834e-05} {"step": 2420, "timestamp": 1778328332.6163313, "train/loss": 2.4069001197814943, "train/z_loss": 0.0013586937449872494, "train/perplexity": 11.099500638474442, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024856.257031559, "perf/iters_per_sec": 0.965526703372745, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357041358947754, "data/tokens_consumed": 5077204992, "data/tokens_consumed_B": 5.077204992, "train/loss_slope": -2.44988998850294e-05} {"step": 2430, "timestamp": 1778328342.9677136, "train/loss": 2.4120054960250856, "train/z_loss": 0.0013480013236403464, "train/perplexity": 11.156312665336209, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027077.6132542742, "perf/iters_per_sec": 0.9665859285613414, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345691680908202, "data/tokens_consumed": 5098176512, "data/tokens_consumed_B": 5.098176512, "train/loss_slope": -2.231275279207912e-05} {"step": 2440, "timestamp": 1778328353.3211462, "train/loss": 2.4438881635665894, "train/z_loss": 0.001347909972537309, "train/perplexity": 11.517736637975895, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026606.7945877637, "perf/iters_per_sec": 0.9663614247263735, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348095178604126, "data/tokens_consumed": 5119148032, "data/tokens_consumed_B": 5.119148032, "train/loss_slope": -2.0959156767727533e-05} {"step": 2450, "timestamp": 1778328363.6586716, "grad/layer_0/attn": 0.0035855157766491175, "grad/layer_0/mlp": 0.0036889221519231796, "grad/layer_0/attn_mlp_ratio": 0.9719683776961815, "grad/layer_4/attn": 0.004059800878167152, "grad/layer_4/mlp": 0.002756875939667225, "grad/layer_4/attn_mlp_ratio": 1.4726091488165713, "grad/layer_8/attn": 0.0056571951135993, "grad/layer_8/mlp": 0.0035696299746632576, "grad/layer_8/attn_mlp_ratio": 1.5848127103571017, "grad/layer_12/attn": 0.009167805314064026, "grad/layer_12/mlp": 0.006928754039108753, "grad/layer_12/attn_mlp_ratio": 1.323153503501754, "grad/layer_16/attn": 0.003681130474433303, "grad/layer_16/mlp": 0.004589981399476528, "grad/layer_16/attn_mlp_ratio": 0.8019924426390659, "grad/layer_20/attn": 0.0032571477349847555, "grad/layer_20/mlp": 0.006680003833025694, "grad/layer_20/attn_mlp_ratio": 0.4875966792297138, "grad/layer_24/attn": 0.01430789940059185, "grad/layer_24/mlp": 0.011582432314753532, "grad/layer_24/attn_mlp_ratio": 1.235310415657307, "grad/layer_27/attn": 0.005825111176818609, "grad/layer_27/mlp": 0.011922202073037624, "grad/layer_27/attn_mlp_ratio": 0.48859355782375946} {"step": 2450, "timestamp": 1778328363.6746073, "train/loss": 2.365058946609497, "train/z_loss": 0.0013587026856839656, "train/perplexity": 10.644666266002453, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026673.614073353, "perf/iters_per_sec": 0.9663932867399945, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034775400161743, "data/tokens_consumed": 5140119552, "data/tokens_consumed_B": 5.140119552, "train/loss_slope": -1.987397701504921e-05} {"step": 2460, "timestamp": 1778328374.0298839, "train/loss": 2.4028865098953247, "train/z_loss": 0.0013497515115886926, "train/perplexity": 11.055040854777783, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026692.5260485183, "perf/iters_per_sec": 0.9664023046724883, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347657442092895, "data/tokens_consumed": 5161091072, "data/tokens_consumed_B": 5.161091072, "train/loss_slope": -1.963657693322126e-05} {"step": 2470, "timestamp": 1778328384.38984, "train/loss": 2.3614281892776487, "train/z_loss": 0.001362142339348793, "train/perplexity": 10.606088142194363, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025254.5890798212, "perf/iters_per_sec": 0.9657166428946596, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035500431060791, "data/tokens_consumed": 5182062592, "data/tokens_consumed_B": 5.182062592, "train/loss_slope": -2.1420572686045136e-05} {"step": 2475, "timestamp": 1778328390.1497934, "eos/sharpness": 49.07031059265136, "eos/L0_probe": 2.391890287399292, "eos/L_plus": 2.5761330127716064, "eos/L_minus": 2.698350667953491, "eos/grad_norm": 0.1237301379442215, "eos/embed_grad_frac": 0.1521318405866623, "eos/time_s": 0.5953283309936523} {"step": 2475, "timestamp": 1778328391.5281587, "geo/rankme_last": 424.26641845703125, "geo/layer_0/stable_rank_q_proj": 20.2879695892334, "geo/layer_0/stable_rank_k_proj": 17.09723472595215, "geo/layer_0/stable_rank_o_proj": 45.951263427734375, "geo/layer_0/stable_rank_gate_proj": 129.75694274902344, "geo/layer_0/stable_rank_down_proj": 55.76617431640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06551600247621536, "geo/layer_0/attn_entropy_mean": 6.257180213928223, "geo/layer_0/attn_entropy_std": 0.41120731830596924, "geo/layer_7/stable_rank_q_proj": 42.66107177734375, "geo/layer_7/stable_rank_k_proj": 39.6453857421875, "geo/layer_7/stable_rank_o_proj": 89.03449249267578, "geo/layer_7/stable_rank_gate_proj": 79.11043548583984, "geo/layer_7/stable_rank_down_proj": 143.04440307617188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4315358102321625, "geo/layer_7/attn_entropy_mean": 4.702018737792969, "geo/layer_7/attn_entropy_std": 0.77214515209198, "geo/layer_14/stable_rank_q_proj": 50.789329528808594, "geo/layer_14/stable_rank_k_proj": 41.47807312011719, "geo/layer_14/stable_rank_o_proj": 42.89939498901367, "geo/layer_14/stable_rank_gate_proj": 71.53455352783203, "geo/layer_14/stable_rank_down_proj": 127.4603271484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36532896757125854, "geo/layer_14/attn_entropy_mean": 5.5410380363464355, "geo/layer_14/attn_entropy_std": 0.43369609117507935, "geo/layer_21/stable_rank_q_proj": 40.01932907104492, "geo/layer_21/stable_rank_k_proj": 29.546825408935547, "geo/layer_21/stable_rank_o_proj": 67.09131622314453, "geo/layer_21/stable_rank_gate_proj": 63.509490966796875, "geo/layer_21/stable_rank_down_proj": 49.99168014526367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13658584654331207, "geo/layer_21/attn_entropy_mean": 5.870508193969727, "geo/layer_21/attn_entropy_std": 0.3195915222167969, "geo/layer_27/stable_rank_q_proj": 43.77756881713867, "geo/layer_27/stable_rank_k_proj": 31.14167594909668, "geo/layer_27/stable_rank_o_proj": 112.93399047851562, "geo/layer_27/stable_rank_gate_proj": 75.02678680419922, "geo/layer_27/stable_rank_down_proj": 127.6961669921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09882080554962158, "geo/layer_27/attn_entropy_mean": 4.318033218383789, "geo/layer_27/attn_entropy_std": 0.6567201018333435, "attnres/final_alpha/block_0": 0.25090691447257996, "attnres/block_norm/0": 1.78019118309021, "attnres/final_alpha/block_1": 0.0040755318477749825, "attnres/block_norm/1": 49437.9453125, "attnres/final_alpha/block_2": 0.00883010495454073, "attnres/block_norm/2": 29643.892578125, "attnres/final_alpha/block_3": 0.010655669495463371, "attnres/block_norm/3": 67293.9609375, "attnres/final_alpha/block_4": 0.012625524774193764, "attnres/block_norm/4": 16499.72265625, "attnres/final_alpha/block_5": 0.6076708436012268, "attnres/block_norm/5": 6951.373046875, "attnres/final_alpha/block_6": 0.10523544251918793, "attnres/block_norm/6": 44910.39453125, "geo/tier1_time_s": 1.3589198589324951, "geo/step": 2475.0, "geo/rankme_slope": 0.0025527631595962247} {"step": 2480, "timestamp": 1778328396.705447, "train/loss": 2.361788034439087, "train/z_loss": 0.0013487342046573758, "train/perplexity": 10.609905378459928, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703625.6866013426, "perf/iters_per_sec": 0.8123520310408319, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2309934139251708, "data/tokens_consumed": 5203034112, "data/tokens_consumed_B": 5.203034112, "train/loss_slope": -2.2457401708836946e-05} {"step": 2490, "timestamp": 1778328407.0472107, "train/loss": 2.4241008043289183, "train/z_loss": 0.0013544932240620255, "train/perplexity": 11.29207107136789, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028763.489389248, "perf/iters_per_sec": 0.9673898169466247, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033709454536438, "data/tokens_consumed": 5224005632, "data/tokens_consumed_B": 5.224005632, "train/loss_slope": -2.0389611650221426e-05} {"step": 2500, "timestamp": 1778328417.382861, "grad/layer_0/attn": 0.003132287412881851, "grad/layer_0/mlp": 0.003300608368590474, "grad/layer_0/attn_mlp_ratio": 0.9490030225304171, "grad/layer_4/attn": 0.002112290123477578, "grad/layer_4/mlp": 0.002790259663015604, "grad/layer_4/attn_mlp_ratio": 0.7570227516002658, "grad/layer_8/attn": 0.002999903168529272, "grad/layer_8/mlp": 0.0036312362644821405, "grad/layer_8/attn_mlp_ratio": 0.8261382260521329, "grad/layer_12/attn": 0.007279500365257263, "grad/layer_12/mlp": 0.006940709426999092, "grad/layer_12/attn_mlp_ratio": 1.0488121332466498, "grad/layer_16/attn": 0.010878480970859528, "grad/layer_16/mlp": 0.0044878460466861725, "grad/layer_16/attn_mlp_ratio": 2.4239870564395805, "grad/layer_20/attn": 0.0038646662142127752, "grad/layer_20/mlp": 0.006148673593997955, "grad/layer_20/attn_mlp_ratio": 0.628536560329308, "grad/layer_24/attn": 0.015196948312222958, "grad/layer_24/mlp": 0.011629980988800526, "grad/layer_24/attn_mlp_ratio": 1.306704473222004, "grad/layer_27/attn": 0.0098618408665061, "grad/layer_27/mlp": 0.011796037666499615, "grad/layer_27/attn_mlp_ratio": 0.8360299502018742} {"step": 2500, "timestamp": 1778328417.3987017, "train/loss": 2.373078489303589, "train/z_loss": 0.0013498904299922287, "train/perplexity": 10.730374833999269, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026848.3648207365, "perf/iters_per_sec": 0.9664766143897707, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346861839294434, "data/tokens_consumed": 5244977152, "data/tokens_consumed_B": 5.244977152, "train/loss_slope": -2.3876996044636208e-05} {"step": 2500, "timestamp": 1778328424.2812827, "geo/ww_alpha_mean": 7.759011914836437, "geo/ww_alpha_std": 4.8926527370638775, "geo/ww_alpha_min": 1.3358051669370083, "geo/ww_alpha_max": 31.293531120972933, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.922521473320185, "geo/ww_alpha_by_type/k_proj": 4.515877193156365, "geo/ww_alpha_by_type/v_proj": 9.444982989320819, "geo/ww_alpha_by_type/o_proj": 8.893629579369518, "geo/ww_alpha_by_type/gate_proj": 7.894443181771689, "geo/ww_alpha_by_type/up_proj": 12.010092140628553, "geo/ww_alpha_by_type/down_proj": 7.733732221812874, "geo/twonn_id/layer_0": 0.7353553175926208, "geo/twonn_id/layer_7": 3.5456480979919434, "geo/twonn_id/layer_14": 5.503360748291016, "geo/twonn_id/layer_21": 8.830821990966797, "geo/twonn_id/layer_27": 6.442984580993652, "geo/tier2_time_s": 6.8749566078186035} {"step": 2500, "timestamp": 1778328425.031131, "eoc/jacobian_sigma/layer_0/attn": 1373.8880615234375, "eoc/jacobian_sigma/layer_0/mlp": 10248.2998046875, "eoc/jacobian_sigma/layer_0": 10248.2998046875, "eoc/jacobian_sigma/layer_7/attn": 1.0728265047073364, "eoc/jacobian_sigma/layer_7/mlp": 1.7684013843536377, "eoc/jacobian_sigma/layer_7": 1.7684013843536377, "eoc/jacobian_sigma/layer_14/attn": 1.9236929416656494, "eoc/jacobian_sigma/layer_14/mlp": 12.207828521728516, "eoc/jacobian_sigma/layer_14": 12.207828521728516, "eoc/jacobian_sigma/layer_21/attn": 1.0336098670959473, "eoc/jacobian_sigma/layer_21/mlp": 4.712545394897461, "eoc/jacobian_sigma/layer_21": 4.712545394897461, "eoc/jacobian_sigma/layer_27/attn": 3.549346923828125, "eoc/jacobian_sigma/layer_27/mlp": 30.474103927612305, "eoc/jacobian_sigma/layer_27": 30.474103927612305, "eoc/layer0_sigma": 10248.2998046875, "eoc/sigma_max": 30.474103927612305, "eoc/sigma_min": 1.7684013843536377, "eoc/sigma_mean": 12.29071980714798, "eoc/time_s": 0.7435505390167236} {"step": 2510, "timestamp": 1778328435.4025402, "train/loss": 2.3866759538650513, "train/z_loss": 0.0013545312918722629, "train/perplexity": 10.877277212071522, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1165250.8262261646, "perf/iters_per_sec": 0.5556348925715278, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.7997429847717286, "data/tokens_consumed": 5265948672, "data/tokens_consumed_B": 5.265948672, "train/loss_slope": -2.2615104861850315e-05} {"step": 2520, "timestamp": 1778328445.7745287, "train/loss": 2.4160205841064455, "train/z_loss": 0.0013332201982848347, "train/perplexity": 11.201196288904605, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023312.8495104294, "perf/iters_per_sec": 0.9647907493164203, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364941835403443, "data/tokens_consumed": 5286920192, "data/tokens_consumed_B": 5.286920192, "train/loss_slope": -2.133857027174579e-05} {"step": 2530, "timestamp": 1778328456.1225758, "train/loss": 2.4264641284942625, "train/z_loss": 0.0013557707075960933, "train/perplexity": 11.318789455473194, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027912.8840819122, "perf/iters_per_sec": 0.9669842167291223, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034143042564392, "data/tokens_consumed": 5307891712, "data/tokens_consumed_B": 5.307891712, "train/loss_slope": -1.8113698696110155e-05} {"step": 2540, "timestamp": 1778328466.4738414, "train/loss": 2.389151191711426, "train/z_loss": 0.0013579078018665313, "train/perplexity": 10.904234409266264, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026903.8038097692, "perf/iters_per_sec": 0.9665030497597548, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034657883644104, "data/tokens_consumed": 5328863232, "data/tokens_consumed_B": 5.328863232, "train/loss_slope": -1.5692216842839063e-05} {"step": 2550, "timestamp": 1778328476.8229191, "grad/layer_0/attn": 0.002675616880878806, "grad/layer_0/mlp": 0.0030403484124690294, "grad/layer_0/attn_mlp_ratio": 0.8800362425247001, "grad/layer_4/attn": 0.0019134900067001581, "grad/layer_4/mlp": 0.0026767735835164785, "grad/layer_4/attn_mlp_ratio": 0.7148493794912111, "grad/layer_8/attn": 0.004966015461832285, "grad/layer_8/mlp": 0.003396309446543455, "grad/layer_8/attn_mlp_ratio": 1.4621798731173885, "grad/layer_12/attn": 0.0061853015795350075, "grad/layer_12/mlp": 0.006631563417613506, "grad/layer_12/attn_mlp_ratio": 0.9327063765742089, "grad/layer_16/attn": 0.003590776352211833, "grad/layer_16/mlp": 0.004395193420350552, "grad/layer_16/attn_mlp_ratio": 0.8169779864267366, "grad/layer_20/attn": 0.005261821672320366, "grad/layer_20/mlp": 0.006058970931917429, "grad/layer_20/attn_mlp_ratio": 0.8684348620586166, "grad/layer_24/attn": 0.01448926143348217, "grad/layer_24/mlp": 0.009578603319823742, "grad/layer_24/attn_mlp_ratio": 1.5126695195977526, "grad/layer_27/attn": 0.004426940809935331, "grad/layer_27/mlp": 0.008480777963995934, "grad/layer_27/attn_mlp_ratio": 0.521997012129034} {"step": 2550, "timestamp": 1778328477.4444387, "eos/sharpness": 46.74785137176513, "eos/L0_probe": 2.3848094940185547, "eos/L_plus": 2.586825132369995, "eos/L_minus": 2.6502723693847656, "eos/grad_norm": 0.13821393251419067, "eos/embed_grad_frac": 0.11992250382900238, "eos/time_s": 0.6187059879302979} {"step": 2550, "timestamp": 1778328477.4650943, "train/loss": 2.3982293128967287, "train/z_loss": 0.0013500927132554352, "train/perplexity": 11.00367505485561, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908882.9702634073, "perf/iters_per_sec": 0.9102263308827435, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0986278533935547, "data/tokens_consumed": 5349834752, "data/tokens_consumed_B": 5.349834752, "train/loss_slope": -1.4058043267896407e-05} {"step": 2550, "timestamp": 1778328478.829856, "geo/rankme_last": 424.360107421875, "geo/layer_0/stable_rank_q_proj": 20.279611587524414, "geo/layer_0/stable_rank_k_proj": 17.075817108154297, "geo/layer_0/stable_rank_o_proj": 45.807403564453125, "geo/layer_0/stable_rank_gate_proj": 129.95176696777344, "geo/layer_0/stable_rank_down_proj": 55.87166213989258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07047801464796066, "geo/layer_0/attn_entropy_mean": 6.253416061401367, "geo/layer_0/attn_entropy_std": 0.41645288467407227, "geo/layer_7/stable_rank_q_proj": 42.684417724609375, "geo/layer_7/stable_rank_k_proj": 39.686763763427734, "geo/layer_7/stable_rank_o_proj": 89.09756469726562, "geo/layer_7/stable_rank_gate_proj": 79.17870330810547, "geo/layer_7/stable_rank_down_proj": 143.20436096191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4192180633544922, "geo/layer_7/attn_entropy_mean": 4.73968505859375, "geo/layer_7/attn_entropy_std": 0.7586975693702698, "geo/layer_14/stable_rank_q_proj": 50.8013916015625, "geo/layer_14/stable_rank_k_proj": 41.51578140258789, "geo/layer_14/stable_rank_o_proj": 42.827022552490234, "geo/layer_14/stable_rank_gate_proj": 71.56211853027344, "geo/layer_14/stable_rank_down_proj": 127.70361328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3781391382217407, "geo/layer_14/attn_entropy_mean": 5.533862113952637, "geo/layer_14/attn_entropy_std": 0.4360659718513489, "geo/layer_21/stable_rank_q_proj": 40.05291748046875, "geo/layer_21/stable_rank_k_proj": 29.56195068359375, "geo/layer_21/stable_rank_o_proj": 66.89037322998047, "geo/layer_21/stable_rank_gate_proj": 63.33371353149414, "geo/layer_21/stable_rank_down_proj": 49.96369552612305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14006809890270233, "geo/layer_21/attn_entropy_mean": 5.874951362609863, "geo/layer_21/attn_entropy_std": 0.31291085481643677, "geo/layer_27/stable_rank_q_proj": 43.78575897216797, "geo/layer_27/stable_rank_k_proj": 31.13371467590332, "geo/layer_27/stable_rank_o_proj": 112.69867706298828, "geo/layer_27/stable_rank_gate_proj": 75.03064727783203, "geo/layer_27/stable_rank_down_proj": 127.71304321289062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10324806720018387, "geo/layer_27/attn_entropy_mean": 4.327381610870361, "geo/layer_27/attn_entropy_std": 0.6578112244606018, "attnres/final_alpha/block_0": 0.2503725588321686, "attnres/block_norm/0": 1.7799911499023438, "attnres/final_alpha/block_1": 0.004144440405070782, "attnres/block_norm/1": 49271.49609375, "attnres/final_alpha/block_2": 0.009020961821079254, "attnres/block_norm/2": 29533.15625, "attnres/final_alpha/block_3": 0.010724282823503017, "attnres/block_norm/3": 66809.640625, "attnres/final_alpha/block_4": 0.012375813908874989, "attnres/block_norm/4": 16496.935546875, "attnres/final_alpha/block_5": 0.607811450958252, "attnres/block_norm/5": 6927.64794921875, "attnres/final_alpha/block_6": 0.10555051267147064, "attnres/block_norm/6": 44646.5625, "geo/tier1_time_s": 1.3602566719055176, "geo/step": 2550.0, "geo/rankme_slope": 0.0025046830092714167} {"step": 2560, "timestamp": 1778328489.1826413, "train/loss": 2.40724515914917, "train/z_loss": 0.0013421347481198608, "train/perplexity": 11.103331063942045, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790406.8471728866, "perf/iters_per_sec": 0.853732513033336, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1713270664215087, "data/tokens_consumed": 5370806272, "data/tokens_consumed_B": 5.370806272, "train/loss_slope": -1.5547248689827153e-05} {"step": 2570, "timestamp": 1778328499.5415192, "train/loss": 2.3490798234939576, "train/z_loss": 0.0013690831838175654, "train/perplexity": 10.475925587649085, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025859.0532009697, "perf/iters_per_sec": 0.9660048738484238, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351914644241333, "data/tokens_consumed": 5391777792, "data/tokens_consumed_B": 5.391777792, "train/loss_slope": -1.7518545849488477e-05} {"step": 2580, "timestamp": 1778328509.9140186, "train/loss": 2.393594002723694, "train/z_loss": 0.0013406811398454011, "train/perplexity": 10.952787638526676, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022742.4169883546, "perf/iters_per_sec": 0.9645187458936475, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367864847183228, "data/tokens_consumed": 5412749312, "data/tokens_consumed_B": 5.412749312, "train/loss_slope": -1.9326921214651024e-05} {"step": 2590, "timestamp": 1778328520.27359, "train/loss": 2.4191598653793336, "train/z_loss": 0.001353583112359047, "train/perplexity": 11.236415246832024, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025996.3311354178, "perf/iters_per_sec": 0.96607033306857, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351213216781616, "data/tokens_consumed": 5433720832, "data/tokens_consumed_B": 5.433720832, "train/loss_slope": -1.7914398604243306e-05} {"step": 2600, "timestamp": 1778328530.6145573, "grad/layer_0/attn": 0.0030941248405724764, "grad/layer_0/mlp": 0.003296712413430214, "grad/layer_0/attn_mlp_ratio": 0.9385485777020496, "grad/layer_4/attn": 0.0018929614452645183, "grad/layer_4/mlp": 0.002605053596198559, "grad/layer_4/attn_mlp_ratio": 0.726649684045605, "grad/layer_8/attn": 0.003565693972632289, "grad/layer_8/mlp": 0.003593335160985589, "grad/layer_8/attn_mlp_ratio": 0.9923076233232636, "grad/layer_12/attn": 0.007556368131190538, "grad/layer_12/mlp": 0.0071840048767626286, "grad/layer_12/attn_mlp_ratio": 1.0518322517359544, "grad/layer_16/attn": 0.009338447824120522, "grad/layer_16/mlp": 0.00462657306343317, "grad/layer_16/attn_mlp_ratio": 2.0184372956486185, "grad/layer_20/attn": 0.005775747820734978, "grad/layer_20/mlp": 0.005767917260527611, "grad/layer_20/attn_mlp_ratio": 1.0013575888345687, "grad/layer_24/attn": 0.008897152729332447, "grad/layer_24/mlp": 0.009731142781674862, "grad/layer_24/attn_mlp_ratio": 0.9142967930403183, "grad/layer_27/attn": 0.007537773810327053, "grad/layer_27/mlp": 0.008745357394218445, "grad/layer_27/attn_mlp_ratio": 0.8619171732328009} {"step": 2600, "timestamp": 1778328530.6301367, "train/loss": 2.3811262607574464, "train/z_loss": 0.001339425821788609, "train/perplexity": 10.817078857399672, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025991.0580499724, "perf/iters_per_sec": 0.9660678186654913, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351240158081054, "data/tokens_consumed": 5454692352, "data/tokens_consumed_B": 5.454692352, "train/loss_slope": -1.5933099896064366e-05} {"step": 2610, "timestamp": 1778328541.4790034, "train/loss": 2.4325879573822022, "train/z_loss": 0.0013345127925276756, "train/perplexity": 11.388316453758273, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933967.8382984288, "perf/iters_per_sec": 0.9221877280704636, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.084377908706665, "data/tokens_consumed": 5475663872, "data/tokens_consumed_B": 5.475663872, "train/loss_slope": -1.311631614726255e-05} {"step": 2620, "timestamp": 1778328551.8494186, "train/loss": 2.4010435581207275, "train/z_loss": 0.0013535054284147918, "train/perplexity": 11.034685710152484, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023610.2906001098, "perf/iters_per_sec": 0.9649325802803563, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363418340682984, "data/tokens_consumed": 5496635392, "data/tokens_consumed_B": 5.496635392, "train/loss_slope": -1.369598028433056e-05} {"step": 2625, "timestamp": 1778328557.6165118, "eos/sharpness": 40.14999866485595, "eos/L0_probe": 2.379014492034912, "eos/L_plus": 2.533010482788086, "eos/L_minus": 2.626518487930298, "eos/grad_norm": 0.12857437133789062, "eos/embed_grad_frac": 0.1632879674434662, "eos/time_s": 0.6046097278594971} {"step": 2625, "timestamp": 1778328559.0068967, "geo/rankme_last": 424.74456787109375, "geo/layer_0/stable_rank_q_proj": 20.299846649169922, "geo/layer_0/stable_rank_k_proj": 17.097938537597656, "geo/layer_0/stable_rank_o_proj": 45.77992630004883, "geo/layer_0/stable_rank_gate_proj": 130.11709594726562, "geo/layer_0/stable_rank_down_proj": 55.91117477416992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07419000566005707, "geo/layer_0/attn_entropy_mean": 6.251805305480957, "geo/layer_0/attn_entropy_std": 0.41762906312942505, "geo/layer_7/stable_rank_q_proj": 42.6397590637207, "geo/layer_7/stable_rank_k_proj": 39.672821044921875, "geo/layer_7/stable_rank_o_proj": 89.14080810546875, "geo/layer_7/stable_rank_gate_proj": 79.2226791381836, "geo/layer_7/stable_rank_down_proj": 143.0126190185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42308309674263, "geo/layer_7/attn_entropy_mean": 4.753839015960693, "geo/layer_7/attn_entropy_std": 0.758021354675293, "geo/layer_14/stable_rank_q_proj": 50.80458450317383, "geo/layer_14/stable_rank_k_proj": 41.52619552612305, "geo/layer_14/stable_rank_o_proj": 42.84516525268555, "geo/layer_14/stable_rank_gate_proj": 71.48881530761719, "geo/layer_14/stable_rank_down_proj": 127.88292694091797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37774550914764404, "geo/layer_14/attn_entropy_mean": 5.5345354080200195, "geo/layer_14/attn_entropy_std": 0.43871045112609863, "geo/layer_21/stable_rank_q_proj": 40.018611907958984, "geo/layer_21/stable_rank_k_proj": 29.49701690673828, "geo/layer_21/stable_rank_o_proj": 66.81486511230469, "geo/layer_21/stable_rank_gate_proj": 63.398555755615234, "geo/layer_21/stable_rank_down_proj": 49.951438903808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13939495384693146, "geo/layer_21/attn_entropy_mean": 5.880990028381348, "geo/layer_21/attn_entropy_std": 0.3116549551486969, "geo/layer_27/stable_rank_q_proj": 43.71025466918945, "geo/layer_27/stable_rank_k_proj": 31.158309936523438, "geo/layer_27/stable_rank_o_proj": 112.76145935058594, "geo/layer_27/stable_rank_gate_proj": 74.9294204711914, "geo/layer_27/stable_rank_down_proj": 127.78180694580078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09860611706972122, "geo/layer_27/attn_entropy_mean": 4.324863433837891, "geo/layer_27/attn_entropy_std": 0.6594431400299072, "attnres/final_alpha/block_0": 0.24959895014762878, "attnres/block_norm/0": 1.7799391746520996, "attnres/final_alpha/block_1": 0.004113921895623207, "attnres/block_norm/1": 49349.015625, "attnres/final_alpha/block_2": 0.008924588561058044, "attnres/block_norm/2": 29536.02734375, "attnres/final_alpha/block_3": 0.01074487715959549, "attnres/block_norm/3": 67043.953125, "attnres/final_alpha/block_4": 0.01242885086685419, "attnres/block_norm/4": 16524.96875, "attnres/final_alpha/block_5": 0.6102017164230347, "attnres/block_norm/5": 6950.8330078125, "attnres/final_alpha/block_6": 0.10398705303668976, "attnres/block_norm/6": 44845.96875, "geo/tier1_time_s": 1.362825632095337, "geo/step": 2625.0, "geo/rankme_slope": 0.002475991424599823} {"step": 2630, "timestamp": 1778328564.1906893, "train/loss": 2.4299633741378783, "train/z_loss": 0.0013527916162274778, "train/perplexity": 11.358466058771153, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699889.075914048, "perf/iters_per_sec": 0.8105702762193908, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2336993217468262, "data/tokens_consumed": 5517606912, "data/tokens_consumed_B": 5.517606912, "train/loss_slope": -1.134615714150146e-05} {"step": 2640, "timestamp": 1778328574.5408998, "train/loss": 2.4361393451690674, "train/z_loss": 0.0013484133058227598, "train/perplexity": 11.428832683563206, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027084.2000085958, "perf/iters_per_sec": 0.9665890693705539, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034565806388855, "data/tokens_consumed": 5538578432, "data/tokens_consumed_B": 5.538578432, "train/loss_slope": -8.68539110590589e-06} {"step": 2650, "timestamp": 1778328584.8800886, "grad/layer_0/attn": 0.00399226788431406, "grad/layer_0/mlp": 0.004065772984176874, "grad/layer_0/attn_mlp_ratio": 0.9819209782885129, "grad/layer_4/attn": 0.002276630839332938, "grad/layer_4/mlp": 0.002792378654703498, "grad/layer_4/attn_mlp_ratio": 0.8153015902653495, "grad/layer_8/attn": 0.007553301751613617, "grad/layer_8/mlp": 0.0038403738290071487, "grad/layer_8/attn_mlp_ratio": 1.9668141413423172, "grad/layer_12/attn": 0.01000667829066515, "grad/layer_12/mlp": 0.00786371249705553, "grad/layer_12/attn_mlp_ratio": 1.2725132266929524, "grad/layer_16/attn": 0.005187688395380974, "grad/layer_16/mlp": 0.005089334212243557, "grad/layer_16/attn_mlp_ratio": 1.0193255300405013, "grad/layer_20/attn": 0.003247882705181837, "grad/layer_20/mlp": 0.00597275048494339, "grad/layer_20/attn_mlp_ratio": 0.5437834141893305, "grad/layer_24/attn": 0.006765519268810749, "grad/layer_24/mlp": 0.00930398516356945, "grad/layer_24/attn_mlp_ratio": 0.7271635839000862, "grad/layer_27/attn": 0.006353947799652815, "grad/layer_27/mlp": 0.010049182921648026, "grad/layer_27/attn_mlp_ratio": 0.6322850112257975} {"step": 2650, "timestamp": 1778328584.895704, "train/loss": 2.392133092880249, "train/z_loss": 0.0013596747419796884, "train/perplexity": 10.936798285597716, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026583.4018049857, "perf/iters_per_sec": 0.9663502701783112, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348214626312255, "data/tokens_consumed": 5559549952, "data/tokens_consumed_B": 5.559549952, "train/loss_slope": -7.64328548581324e-06} {"step": 2660, "timestamp": 1778328595.2440572, "train/loss": 2.367817735671997, "train/z_loss": 0.0013571511371992528, "train/perplexity": 10.67407319998384, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027564.7748249008, "perf/iters_per_sec": 0.9668182253002647, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034320592880249, "data/tokens_consumed": 5580521472, "data/tokens_consumed_B": 5.580521472, "train/loss_slope": -6.812276448210572e-06} {"step": 2670, "timestamp": 1778328605.5945542, "train/loss": 2.4264119625091554, "train/z_loss": 0.00136056796181947, "train/perplexity": 11.318199015071617, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027424.4337067688, "perf/iters_per_sec": 0.9667513054403156, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343921899795532, "data/tokens_consumed": 5601492992, "data/tokens_consumed_B": 5.601492992, "train/loss_slope": -6.1963252323080205e-06} {"step": 2680, "timestamp": 1778328615.9504166, "train/loss": 2.3649183988571165, "train/z_loss": 0.0013545133522711694, "train/perplexity": 10.643170287214618, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026544.9753120544, "perf/iters_per_sec": 0.9663319469986221, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348410844802856, "data/tokens_consumed": 5622464512, "data/tokens_consumed_B": 5.622464512, "train/loss_slope": -7.1631750901682575e-06} {"step": 2690, "timestamp": 1778328626.2942076, "train/loss": 2.441254138946533, "train/z_loss": 0.0013491008896380663, "train/perplexity": 11.487438556567882, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028331.8300580287, "perf/iters_per_sec": 0.9671839857378143, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033929443359375, "data/tokens_consumed": 5643436032, "data/tokens_consumed_B": 5.643436032, "train/loss_slope": -7.313552247558677e-06} {"step": 2700, "timestamp": 1778328636.6345303, "grad/layer_0/attn": 0.0026930489111691713, "grad/layer_0/mlp": 0.002916374709457159, "grad/layer_0/attn_mlp_ratio": 0.9234234579301017, "grad/layer_4/attn": 0.0033037501852959394, "grad/layer_4/mlp": 0.002658819081261754, "grad/layer_4/attn_mlp_ratio": 1.242562942444296, "grad/layer_8/attn": 0.002949819900095463, "grad/layer_8/mlp": 0.0035037784837186337, "grad/layer_8/attn_mlp_ratio": 0.8418967778964951, "grad/layer_12/attn": 0.008033850230276585, "grad/layer_12/mlp": 0.0073511675000190735, "grad/layer_12/attn_mlp_ratio": 1.0928672378869093, "grad/layer_16/attn": 0.008639534004032612, "grad/layer_16/mlp": 0.00508461520075798, "grad/layer_16/attn_mlp_ratio": 1.699151950147475, "grad/layer_20/attn": 0.002920848084613681, "grad/layer_20/mlp": 0.006020667962729931, "grad/layer_20/attn_mlp_ratio": 0.48513687421081486, "grad/layer_24/attn": 0.010080652311444283, "grad/layer_24/mlp": 0.009297413751482964, "grad/layer_24/attn_mlp_ratio": 1.0842426154705795, "grad/layer_27/attn": 0.004646276589483023, "grad/layer_27/mlp": 0.009453359991312027, "grad/layer_27/attn_mlp_ratio": 0.491494721940521} {"step": 2700, "timestamp": 1778328637.2282133, "eos/sharpness": 43.96471977233886, "eos/L0_probe": 2.3857080936431885, "eos/L_plus": 2.634462594985962, "eos/L_minus": 2.5766007900238037, "eos/grad_norm": 0.13837894797325134, "eos/embed_grad_frac": 0.11950884759426117, "eos/time_s": 0.591010570526123} {"step": 2700, "timestamp": 1778328637.2492478, "train/loss": 2.390488338470459, "train/z_loss": 0.0013526377384550869, "train/perplexity": 10.91882472348813, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915589.100510851, "perf/iters_per_sec": 0.9134240629724746, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.094781756401062, "data/tokens_consumed": 5664407552, "data/tokens_consumed_B": 5.664407552, "train/loss_slope": -7.19222590403259e-06} {"step": 2700, "timestamp": 1778328638.6099367, "geo/rankme_last": 424.84075927734375, "geo/layer_0/stable_rank_q_proj": 20.324811935424805, "geo/layer_0/stable_rank_k_proj": 17.09904670715332, "geo/layer_0/stable_rank_o_proj": 45.71779251098633, "geo/layer_0/stable_rank_gate_proj": 130.06333923339844, "geo/layer_0/stable_rank_down_proj": 55.914878845214844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06458227336406708, "geo/layer_0/attn_entropy_mean": 6.25242805480957, "geo/layer_0/attn_entropy_std": 0.420268714427948, "geo/layer_7/stable_rank_q_proj": 42.60957717895508, "geo/layer_7/stable_rank_k_proj": 39.69385528564453, "geo/layer_7/stable_rank_o_proj": 89.11077880859375, "geo/layer_7/stable_rank_gate_proj": 79.22957611083984, "geo/layer_7/stable_rank_down_proj": 142.99734497070312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42771583795547485, "geo/layer_7/attn_entropy_mean": 4.746216773986816, "geo/layer_7/attn_entropy_std": 0.7829387784004211, "geo/layer_14/stable_rank_q_proj": 50.87284469604492, "geo/layer_14/stable_rank_k_proj": 41.4748649597168, "geo/layer_14/stable_rank_o_proj": 42.87975311279297, "geo/layer_14/stable_rank_gate_proj": 71.50054168701172, "geo/layer_14/stable_rank_down_proj": 127.9563217163086, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3830524682998657, "geo/layer_14/attn_entropy_mean": 5.547285556793213, "geo/layer_14/attn_entropy_std": 0.4214746654033661, "geo/layer_21/stable_rank_q_proj": 40.024559020996094, "geo/layer_21/stable_rank_k_proj": 29.47492218017578, "geo/layer_21/stable_rank_o_proj": 66.77354431152344, "geo/layer_21/stable_rank_gate_proj": 63.47642135620117, "geo/layer_21/stable_rank_down_proj": 49.943660736083984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1376413106918335, "geo/layer_21/attn_entropy_mean": 5.881540298461914, "geo/layer_21/attn_entropy_std": 0.31139689683914185, "geo/layer_27/stable_rank_q_proj": 43.697933197021484, "geo/layer_27/stable_rank_k_proj": 30.997514724731445, "geo/layer_27/stable_rank_o_proj": 112.72716522216797, "geo/layer_27/stable_rank_gate_proj": 74.88526153564453, "geo/layer_27/stable_rank_down_proj": 127.69728088378906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11078425496816635, "geo/layer_27/attn_entropy_mean": 4.328902721405029, "geo/layer_27/attn_entropy_std": 0.6558698415756226, "attnres/final_alpha/block_0": 0.24684852361679077, "attnres/block_norm/0": 1.77964448928833, "attnres/final_alpha/block_1": 0.004014817997813225, "attnres/block_norm/1": 49289.078125, "attnres/final_alpha/block_2": 0.008568059653043747, "attnres/block_norm/2": 29706.17578125, "attnres/final_alpha/block_3": 0.010445734485983849, "attnres/block_norm/3": 67439.3359375, "attnres/final_alpha/block_4": 0.012157605960965157, "attnres/block_norm/4": 16436.416015625, "attnres/final_alpha/block_5": 0.6152462959289551, "attnres/block_norm/5": 6883.048828125, "attnres/final_alpha/block_6": 0.10271893441677094, "attnres/block_norm/6": 44930.50390625, "geo/tier1_time_s": 1.356546401977539, "geo/step": 2700.0, "geo/rankme_slope": 0.002446614737681761} {"step": 2710, "timestamp": 1778328648.9554367, "train/loss": 2.3941961765289306, "train/z_loss": 0.0013451552717015148, "train/perplexity": 10.959385106548767, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792113.711836949, "perf/iters_per_sec": 0.8545464095291848, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1702114582061767, "data/tokens_consumed": 5685379072, "data/tokens_consumed_B": 5.685379072, "train/loss_slope": -5.835940449437331e-06} {"step": 2720, "timestamp": 1778328659.302421, "train/loss": 2.3788140296936033, "train/z_loss": 0.001354696019552648, "train/perplexity": 10.792096165654154, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027630.3487798513, "perf/iters_per_sec": 0.9668494933985955, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342871427536011, "data/tokens_consumed": 5706350592, "data/tokens_consumed_B": 5.706350592, "train/loss_slope": -3.567329770696742e-06} {"step": 2730, "timestamp": 1778328669.6487887, "train/loss": 2.361094045639038, "train/z_loss": 0.0013674673158675433, "train/perplexity": 10.602544777340484, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028149.6217683912, "perf/iters_per_sec": 0.9670971020547825, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034022331237793, "data/tokens_consumed": 5727322112, "data/tokens_consumed_B": 5.727322112, "train/loss_slope": -5.490330722716082e-06} {"step": 2740, "timestamp": 1778328679.9935484, "train/loss": 2.3622426271438597, "train/z_loss": 0.0013446386437863111, "train/perplexity": 10.614729660501936, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028177.6804607115, "perf/iters_per_sec": 0.9671104814818914, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340080261230469, "data/tokens_consumed": 5748293632, "data/tokens_consumed_B": 5.748293632, "train/loss_slope": -8.676234916849485e-06} {"step": 2750, "timestamp": 1778328690.3327582, "grad/layer_0/attn": 0.00295428279787302, "grad/layer_0/mlp": 0.0031410737428814173, "grad/layer_0/attn_mlp_ratio": 0.9405327431471502, "grad/layer_4/attn": 0.002637272933498025, "grad/layer_4/mlp": 0.002573298057541251, "grad/layer_4/attn_mlp_ratio": 1.0248610040656543, "grad/layer_8/attn": 0.004177282564342022, "grad/layer_8/mlp": 0.003463813103735447, "grad/layer_8/attn_mlp_ratio": 1.2059780128550683, "grad/layer_12/attn": 0.007497786078602076, "grad/layer_12/mlp": 0.007096611894667149, "grad/layer_12/attn_mlp_ratio": 1.0565303674818907, "grad/layer_16/attn": 0.003936646040529013, "grad/layer_16/mlp": 0.00458815973252058, "grad/layer_16/attn_mlp_ratio": 0.8580010688874263, "grad/layer_20/attn": 0.0032072551548480988, "grad/layer_20/mlp": 0.006497826427221298, "grad/layer_20/attn_mlp_ratio": 0.4935889164495186, "grad/layer_24/attn": 0.01397852972149849, "grad/layer_24/mlp": 0.011724591255187988, "grad/layer_24/attn_mlp_ratio": 1.1922402494064888, "grad/layer_27/attn": 0.006486229132860899, "grad/layer_27/mlp": 0.011848334223031998, "grad/layer_27/attn_mlp_ratio": 0.5474380580443537} {"step": 2750, "timestamp": 1778328690.348451, "train/loss": 2.419442081451416, "train/z_loss": 0.0013473847648128867, "train/perplexity": 11.239586791316635, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026474.3826853188, "perf/iters_per_sec": 0.9662982858110994, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348771333694458, "data/tokens_consumed": 5769265152, "data/tokens_consumed_B": 5.769265152, "train/loss_slope": -8.342170486427256e-06} {"step": 2760, "timestamp": 1778328700.7210538, "train/loss": 2.3771774530410767, "train/z_loss": 0.0013468313380144536, "train/perplexity": 10.774448517840746, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023210.836651627, "perf/iters_per_sec": 0.9647421057947287, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365464448928834, "data/tokens_consumed": 5790236672, "data/tokens_consumed_B": 5.790236672, "train/loss_slope": -9.790598462731655e-06} {"step": 2770, "timestamp": 1778328711.1015978, "train/loss": 2.3447230100631713, "train/z_loss": 0.001363438309635967, "train/perplexity": 10.430383216196452, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021776.3006808346, "perf/iters_per_sec": 0.9640580657390759, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372819185256958, "data/tokens_consumed": 5811208192, "data/tokens_consumed_B": 5.811208192, "train/loss_slope": -1.369351778451004e-05} {"step": 2775, "timestamp": 1778328716.8881807, "eos/sharpness": 71.3593006134033, "eos/L0_probe": 2.3833625316619873, "eos/L_plus": 2.699047803878784, "eos/L_minus": 2.7812702655792236, "eos/grad_norm": 0.32280176877975464, "eos/embed_grad_frac": 0.025058038532733917, "eos/time_s": 0.6012263298034668} {"step": 2775, "timestamp": 1778328718.2734973, "geo/rankme_last": 424.3502197265625, "geo/layer_0/stable_rank_q_proj": 20.3253116607666, "geo/layer_0/stable_rank_k_proj": 17.1049861907959, "geo/layer_0/stable_rank_o_proj": 45.71326446533203, "geo/layer_0/stable_rank_gate_proj": 129.83070373535156, "geo/layer_0/stable_rank_down_proj": 55.96720504760742, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06517571955919266, "geo/layer_0/attn_entropy_mean": 6.258632659912109, "geo/layer_0/attn_entropy_std": 0.41988760232925415, "geo/layer_7/stable_rank_q_proj": 42.59364318847656, "geo/layer_7/stable_rank_k_proj": 39.60338592529297, "geo/layer_7/stable_rank_o_proj": 89.24541473388672, "geo/layer_7/stable_rank_gate_proj": 79.35270690917969, "geo/layer_7/stable_rank_down_proj": 143.24693298339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4266006350517273, "geo/layer_7/attn_entropy_mean": 4.712655067443848, "geo/layer_7/attn_entropy_std": 0.7621049880981445, "geo/layer_14/stable_rank_q_proj": 50.902732849121094, "geo/layer_14/stable_rank_k_proj": 41.483150482177734, "geo/layer_14/stable_rank_o_proj": 42.878440856933594, "geo/layer_14/stable_rank_gate_proj": 71.46617126464844, "geo/layer_14/stable_rank_down_proj": 127.82599639892578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3556259870529175, "geo/layer_14/attn_entropy_mean": 5.504323959350586, "geo/layer_14/attn_entropy_std": 0.4238792359828949, "geo/layer_21/stable_rank_q_proj": 39.97917175292969, "geo/layer_21/stable_rank_k_proj": 29.452024459838867, "geo/layer_21/stable_rank_o_proj": 66.75129699707031, "geo/layer_21/stable_rank_gate_proj": 63.389434814453125, "geo/layer_21/stable_rank_down_proj": 49.948158264160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13242891430854797, "geo/layer_21/attn_entropy_mean": 5.8550496101379395, "geo/layer_21/attn_entropy_std": 0.3055104911327362, "geo/layer_27/stable_rank_q_proj": 43.66350555419922, "geo/layer_27/stable_rank_k_proj": 30.963123321533203, "geo/layer_27/stable_rank_o_proj": 112.54954528808594, "geo/layer_27/stable_rank_gate_proj": 74.773193359375, "geo/layer_27/stable_rank_down_proj": 127.60074615478516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1121537834405899, "geo/layer_27/attn_entropy_mean": 4.319706916809082, "geo/layer_27/attn_entropy_std": 0.6574788093566895, "attnres/final_alpha/block_0": 0.25267302989959717, "attnres/block_norm/0": 1.7797484397888184, "attnres/final_alpha/block_1": 0.004231352359056473, "attnres/block_norm/1": 49270.55859375, "attnres/final_alpha/block_2": 0.008966336958110332, "attnres/block_norm/2": 29473.0078125, "attnres/final_alpha/block_3": 0.010809943079948425, "attnres/block_norm/3": 67112.375, "attnres/final_alpha/block_4": 0.012648792937397957, "attnres/block_norm/4": 16553.33203125, "attnres/final_alpha/block_5": 0.6037632822990417, "attnres/block_norm/5": 7063.9619140625, "attnres/final_alpha/block_6": 0.10690724849700928, "attnres/block_norm/6": 44999.6796875, "geo/tier1_time_s": 1.3642632961273193, "geo/step": 2775.0, "geo/rankme_slope": 0.002385253416490567} {"step": 2780, "timestamp": 1778328723.465467, "train/loss": 2.3861316442489624, "train/z_loss": 0.0013510894379578531, "train/perplexity": 10.871358216517363, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697047.5393645847, "perf/iters_per_sec": 0.8092153260062145, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.235765027999878, "data/tokens_consumed": 5832179712, "data/tokens_consumed_B": 5.832179712, "train/loss_slope": -1.3531788283198407e-05} {"step": 2790, "timestamp": 1778328733.810702, "train/loss": 2.3549478530883787, "train/z_loss": 0.0013618208700791, "train/perplexity": 10.537579345148991, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028178.1948773179, "perf/iters_per_sec": 0.9671107267748441, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03400776386261, "data/tokens_consumed": 5853151232, "data/tokens_consumed_B": 5.853151232, "train/loss_slope": -1.6650819420778716e-05} {"step": 2800, "timestamp": 1778328744.1489093, "grad/layer_0/attn": 0.0035839001648128033, "grad/layer_0/mlp": 0.0035551409237086773, "grad/layer_0/attn_mlp_ratio": 1.0080894515610874, "grad/layer_4/attn": 0.003076936351135373, "grad/layer_4/mlp": 0.0027088955976068974, "grad/layer_4/attn_mlp_ratio": 1.1358637225691677, "grad/layer_8/attn": 0.003764278953894973, "grad/layer_8/mlp": 0.00352209503762424, "grad/layer_8/attn_mlp_ratio": 1.0687612931529422, "grad/layer_12/attn": 0.00719167897477746, "grad/layer_12/mlp": 0.007151253521442413, "grad/layer_12/attn_mlp_ratio": 1.0056529044381584, "grad/layer_16/attn": 0.003932145889848471, "grad/layer_16/mlp": 0.004971837624907494, "grad/layer_16/attn_mlp_ratio": 0.7908837953719885, "grad/layer_20/attn": 0.005695317406207323, "grad/layer_20/mlp": 0.007660508155822754, "grad/layer_20/attn_mlp_ratio": 0.7434646913771434, "grad/layer_24/attn": 0.01627170853316784, "grad/layer_24/mlp": 0.012583318166434765, "grad/layer_24/attn_mlp_ratio": 1.2931174582599274, "grad/layer_27/attn": 0.005873644258826971, "grad/layer_27/mlp": 0.012344704940915108, "grad/layer_27/attn_mlp_ratio": 0.47580272184385547} {"step": 2800, "timestamp": 1778328744.1647062, "train/loss": 2.420959162712097, "train/z_loss": 0.0013611354399472475, "train/perplexity": 11.256651098513034, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026423.5889171837, "perf/iters_per_sec": 0.9662740654550475, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034903073310852, "data/tokens_consumed": 5874122752, "data/tokens_consumed_B": 5.874122752, "train/loss_slope": -1.2792770132230181e-05} {"step": 2810, "timestamp": 1778328754.5198255, "train/loss": 2.383006978034973, "train/z_loss": 0.0013655751477926969, "train/perplexity": 10.837441867029003, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026194.6750775739, "perf/iters_per_sec": 0.9661649108302945, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350199937820435, "data/tokens_consumed": 5895094272, "data/tokens_consumed_B": 5.895094272, "train/loss_slope": -1.4768140637191442e-05} {"step": 2820, "timestamp": 1778328764.8662984, "train/loss": 2.397381043434143, "train/z_loss": 0.00135221763048321, "train/perplexity": 10.994344931119352, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028311.811735546, "perf/iters_per_sec": 0.9671744402578096, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339396476745606, "data/tokens_consumed": 5916065792, "data/tokens_consumed_B": 5.916065792, "train/loss_slope": -1.4851091503918121e-05} {"step": 2830, "timestamp": 1778328775.212647, "train/loss": 2.432720589637756, "train/z_loss": 0.001359429198782891, "train/perplexity": 11.389827012028658, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027889.6948968936, "perf/iters_per_sec": 0.9669731592640369, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341548681259156, "data/tokens_consumed": 5937037312, "data/tokens_consumed_B": 5.937037312, "train/loss_slope": -1.3798632394291036e-05} {"step": 2840, "timestamp": 1778328785.5676563, "train/loss": 2.375767707824707, "train/z_loss": 0.0013611975591629744, "train/perplexity": 10.75926999202457, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026615.152619142, "perf/iters_per_sec": 0.9663654101463042, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348052501678466, "data/tokens_consumed": 5958008832, "data/tokens_consumed_B": 5.958008832, "train/loss_slope": -1.7102011109199565e-05} {"step": 2850, "timestamp": 1778328795.912264, "grad/layer_0/attn": 0.0030896489042788744, "grad/layer_0/mlp": 0.003378674853593111, "grad/layer_0/attn_mlp_ratio": 0.9144557990087603, "grad/layer_4/attn": 0.0024440214037895203, "grad/layer_4/mlp": 0.0025228685699403286, "grad/layer_4/attn_mlp_ratio": 0.9687469795434602, "grad/layer_8/attn": 0.003741068998351693, "grad/layer_8/mlp": 0.003376978449523449, "grad/layer_8/attn_mlp_ratio": 1.107815446112213, "grad/layer_12/attn": 0.005767356604337692, "grad/layer_12/mlp": 0.007222585380077362, "grad/layer_12/attn_mlp_ratio": 0.7985169050953091, "grad/layer_16/attn": 0.003946981858462095, "grad/layer_16/mlp": 0.005025046877563, "grad/layer_16/attn_mlp_ratio": 0.7854616834599755, "grad/layer_20/attn": 0.0037242043763399124, "grad/layer_20/mlp": 0.006528202909976244, "grad/layer_20/attn_mlp_ratio": 0.5704792529657353, "grad/layer_24/attn": 0.01632130704820156, "grad/layer_24/mlp": 0.010812905617058277, "grad/layer_24/attn_mlp_ratio": 1.509428406691211, "grad/layer_27/attn": 0.012700480408966541, "grad/layer_27/mlp": 0.011235963553190231, "grad/layer_27/attn_mlp_ratio": 1.1303418915350878} {"step": 2850, "timestamp": 1778328796.51892, "eos/sharpness": 74.49104785919188, "eos/L0_probe": 2.389754295349121, "eos/L_plus": 2.689666986465454, "eos/L_minus": 2.834752082824707, "eos/grad_norm": 0.22723355889320374, "eos/embed_grad_frac": 0.04262152686715126, "eos/time_s": 0.6037452220916748} {"step": 2850, "timestamp": 1778328796.5383065, "train/loss": 2.3580448389053346, "train/z_loss": 0.0013523063040338456, "train/perplexity": 10.570264665788573, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913276.3009756256, "perf/iters_per_sec": 0.9123212342146042, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.096105146408081, "data/tokens_consumed": 5978980352, "data/tokens_consumed_B": 5.978980352, "train/loss_slope": -2.0134744678500705e-05} {"step": 2850, "timestamp": 1778328797.9016216, "geo/rankme_last": 424.7678527832031, "geo/layer_0/stable_rank_q_proj": 20.343082427978516, "geo/layer_0/stable_rank_k_proj": 17.116710662841797, "geo/layer_0/stable_rank_o_proj": 45.72506332397461, "geo/layer_0/stable_rank_gate_proj": 130.072509765625, "geo/layer_0/stable_rank_down_proj": 55.92181396484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06748936325311661, "geo/layer_0/attn_entropy_mean": 6.254363059997559, "geo/layer_0/attn_entropy_std": 0.42481598258018494, "geo/layer_7/stable_rank_q_proj": 42.58887481689453, "geo/layer_7/stable_rank_k_proj": 39.57716369628906, "geo/layer_7/stable_rank_o_proj": 89.3143539428711, "geo/layer_7/stable_rank_gate_proj": 79.31880187988281, "geo/layer_7/stable_rank_down_proj": 143.0937957763672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42074495553970337, "geo/layer_7/attn_entropy_mean": 4.738165855407715, "geo/layer_7/attn_entropy_std": 0.7630527019500732, "geo/layer_14/stable_rank_q_proj": 50.972774505615234, "geo/layer_14/stable_rank_k_proj": 41.46620559692383, "geo/layer_14/stable_rank_o_proj": 42.843814849853516, "geo/layer_14/stable_rank_gate_proj": 71.52168273925781, "geo/layer_14/stable_rank_down_proj": 127.90300750732422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38025152683258057, "geo/layer_14/attn_entropy_mean": 5.525572299957275, "geo/layer_14/attn_entropy_std": 0.42315810918807983, "geo/layer_21/stable_rank_q_proj": 39.963706970214844, "geo/layer_21/stable_rank_k_proj": 29.525711059570312, "geo/layer_21/stable_rank_o_proj": 66.72109985351562, "geo/layer_21/stable_rank_gate_proj": 63.41836929321289, "geo/layer_21/stable_rank_down_proj": 49.954017639160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13703612983226776, "geo/layer_21/attn_entropy_mean": 5.8543925285339355, "geo/layer_21/attn_entropy_std": 0.3197415769100189, "geo/layer_27/stable_rank_q_proj": 43.692527770996094, "geo/layer_27/stable_rank_k_proj": 31.061859130859375, "geo/layer_27/stable_rank_o_proj": 112.3062744140625, "geo/layer_27/stable_rank_gate_proj": 74.81873321533203, "geo/layer_27/stable_rank_down_proj": 127.33306121826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09217289835214615, "geo/layer_27/attn_entropy_mean": 4.324355125427246, "geo/layer_27/attn_entropy_std": 0.6571962237358093, "attnres/final_alpha/block_0": 0.25113409757614136, "attnres/block_norm/0": 1.7795131206512451, "attnres/final_alpha/block_1": 0.004114534705877304, "attnres/block_norm/1": 49238.80078125, "attnres/final_alpha/block_2": 0.008997106924653053, "attnres/block_norm/2": 29544.546875, "attnres/final_alpha/block_3": 0.010650687851011753, "attnres/block_norm/3": 67402.8125, "attnres/final_alpha/block_4": 0.012369182892143726, "attnres/block_norm/4": 16529.84375, "attnres/final_alpha/block_5": 0.606932520866394, "attnres/block_norm/5": 6923.744140625, "attnres/final_alpha/block_6": 0.10580191016197205, "attnres/block_norm/6": 44937.01953125, "geo/tier1_time_s": 1.3590795993804932, "geo/step": 2850.0, "geo/rankme_slope": 0.0023451033166223853} {"step": 2860, "timestamp": 1778328808.249933, "train/loss": 2.4185001134872435, "train/z_loss": 0.0013429959653876722, "train/perplexity": 11.229004445526517, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791246.2668169132, "perf/iters_per_sec": 0.8541327795109335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1707781553268433, "data/tokens_consumed": 5999951872, "data/tokens_consumed_B": 5.999951872, "train/loss_slope": -1.6196725477944787e-05} {"step": 2870, "timestamp": 1778328818.6331213, "train/loss": 2.4169015645980836, "train/z_loss": 0.001353470329195261, "train/perplexity": 11.21106867236824, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021172.737510044, "perf/iters_per_sec": 0.96377026439192, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375916719436646, "data/tokens_consumed": 6020923392, "data/tokens_consumed_B": 6.020923392, "train/loss_slope": -1.4662837839112236e-05} {"step": 2880, "timestamp": 1778328829.013238, "train/loss": 2.371509075164795, "train/z_loss": 0.0013636694406159223, "train/perplexity": 10.713547639892274, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021609.9968081147, "perf/iters_per_sec": 0.96397876587301, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373672485351562, "data/tokens_consumed": 6041894912, "data/tokens_consumed_B": 6.041894912, "train/loss_slope": -1.6180533716137076e-05} {"step": 2890, "timestamp": 1778328839.4017825, "train/loss": 2.397367763519287, "train/z_loss": 0.001349344663321972, "train/perplexity": 10.994198928124222, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019694.3424261862, "perf/iters_per_sec": 0.9630653106814319, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383511781692505, "data/tokens_consumed": 6062866432, "data/tokens_consumed_B": 6.062866432, "train/loss_slope": -1.6084533377234494e-05} {"step": 2900, "timestamp": 1778328849.7730696, "grad/layer_0/attn": 0.0031791022047400475, "grad/layer_0/mlp": 0.003399171167984605, "grad/layer_0/attn_mlp_ratio": 0.9352580244139808, "grad/layer_4/attn": 0.0024580664467066526, "grad/layer_4/mlp": 0.0026083849370479584, "grad/layer_4/attn_mlp_ratio": 0.9423710118689268, "grad/layer_8/attn": 0.004579409956932068, "grad/layer_8/mlp": 0.0035464405082166195, "grad/layer_8/attn_mlp_ratio": 1.2912693212237079, "grad/layer_12/attn": 0.006594965700060129, "grad/layer_12/mlp": 0.007864967919886112, "grad/layer_12/attn_mlp_ratio": 0.838524159714972, "grad/layer_16/attn": 0.006326501257717609, "grad/layer_16/mlp": 0.005256316624581814, "grad/layer_16/attn_mlp_ratio": 1.2035997047382907, "grad/layer_20/attn": 0.0029876017943024635, "grad/layer_20/mlp": 0.006195640657097101, "grad/layer_20/attn_mlp_ratio": 0.4822103009894769, "grad/layer_24/attn": 0.00623090798035264, "grad/layer_24/mlp": 0.008543670177459717, "grad/layer_24/attn_mlp_ratio": 0.7293010823218791, "grad/layer_27/attn": 0.005762794520705938, "grad/layer_27/mlp": 0.008049589581787586, "grad/layer_27/attn_mlp_ratio": 0.7159115866171932} {"step": 2900, "timestamp": 1778328849.788155, "train/loss": 2.399208498001099, "train/z_loss": 0.0013613005867227912, "train/perplexity": 11.014454966465772, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020559.648262785, "perf/iters_per_sec": 0.9634779206575322, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379065036773683, "data/tokens_consumed": 6083837952, "data/tokens_consumed_B": 6.083837952, "train/loss_slope": -1.3241855596730606e-05} {"step": 2910, "timestamp": 1778328860.1718247, "train/loss": 2.3490786790847777, "train/z_loss": 0.0013545943307690322, "train/perplexity": 10.475913598910536, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020820.1599371724, "perf/iters_per_sec": 0.9636021423040259, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377727031707764, "data/tokens_consumed": 6104809472, "data/tokens_consumed_B": 6.104809472, "train/loss_slope": -1.5368136049139564e-05} {"step": 2920, "timestamp": 1778328870.5526645, "train/loss": 2.411690831184387, "train/z_loss": 0.001352780673187226, "train/perplexity": 11.152802718246022, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021047.7680497693, "perf/iters_per_sec": 0.9637106743096205, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376558303833008, "data/tokens_consumed": 6125780992, "data/tokens_consumed_B": 6.125780992, "train/loss_slope": -1.276688157993502e-05} {"step": 2925, "timestamp": 1778328876.3365133, "eos/sharpness": 68.31552982330321, "eos/L0_probe": 2.3848636150360107, "eos/L_plus": 2.8070731163024902, "eos/L_minus": 2.6458094120025635, "eos/grad_norm": 0.20448057353496552, "eos/embed_grad_frac": 0.058910105377435684, "eos/time_s": 0.6000115871429443} {"step": 2925, "timestamp": 1778328877.7192955, "geo/rankme_last": 425.4018859863281, "geo/layer_0/stable_rank_q_proj": 20.382429122924805, "geo/layer_0/stable_rank_k_proj": 17.151269912719727, "geo/layer_0/stable_rank_o_proj": 45.7425651550293, "geo/layer_0/stable_rank_gate_proj": 130.18431091308594, "geo/layer_0/stable_rank_down_proj": 55.988929748535156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06645257025957108, "geo/layer_0/attn_entropy_mean": 6.252973556518555, "geo/layer_0/attn_entropy_std": 0.41980886459350586, "geo/layer_7/stable_rank_q_proj": 42.67582321166992, "geo/layer_7/stable_rank_k_proj": 39.65454864501953, "geo/layer_7/stable_rank_o_proj": 89.34439086914062, "geo/layer_7/stable_rank_gate_proj": 79.36930084228516, "geo/layer_7/stable_rank_down_proj": 143.0002899169922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4107027053833008, "geo/layer_7/attn_entropy_mean": 4.705906867980957, "geo/layer_7/attn_entropy_std": 0.7773768305778503, "geo/layer_14/stable_rank_q_proj": 51.11383056640625, "geo/layer_14/stable_rank_k_proj": 41.37803268432617, "geo/layer_14/stable_rank_o_proj": 42.885589599609375, "geo/layer_14/stable_rank_gate_proj": 71.47933197021484, "geo/layer_14/stable_rank_down_proj": 127.80479431152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3771195709705353, "geo/layer_14/attn_entropy_mean": 5.571107864379883, "geo/layer_14/attn_entropy_std": 0.42841655015945435, "geo/layer_21/stable_rank_q_proj": 39.88703536987305, "geo/layer_21/stable_rank_k_proj": 29.529272079467773, "geo/layer_21/stable_rank_o_proj": 66.70309448242188, "geo/layer_21/stable_rank_gate_proj": 63.38760757446289, "geo/layer_21/stable_rank_down_proj": 49.920082092285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14081797003746033, "geo/layer_21/attn_entropy_mean": 5.876615524291992, "geo/layer_21/attn_entropy_std": 0.31567224860191345, "geo/layer_27/stable_rank_q_proj": 43.6501350402832, "geo/layer_27/stable_rank_k_proj": 31.115039825439453, "geo/layer_27/stable_rank_o_proj": 112.32073974609375, "geo/layer_27/stable_rank_gate_proj": 74.84423828125, "geo/layer_27/stable_rank_down_proj": 127.41194152832031, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10002260655164719, "geo/layer_27/attn_entropy_mean": 4.329195976257324, "geo/layer_27/attn_entropy_std": 0.6522896885871887, "attnres/final_alpha/block_0": 0.249497652053833, "attnres/block_norm/0": 1.7792432308197021, "attnres/final_alpha/block_1": 0.004032092168927193, "attnres/block_norm/1": 49310.890625, "attnres/final_alpha/block_2": 0.008736463263630867, "attnres/block_norm/2": 29660.9921875, "attnres/final_alpha/block_3": 0.010765441693365574, "attnres/block_norm/3": 67428.8359375, "attnres/final_alpha/block_4": 0.012298212386667728, "attnres/block_norm/4": 16513.181640625, "attnres/final_alpha/block_5": 0.6133132576942444, "attnres/block_norm/5": 6896.3359375, "attnres/final_alpha/block_6": 0.10135689377784729, "attnres/block_norm/6": 45378.8671875, "geo/tier1_time_s": 1.3616864681243896, "geo/step": 2925.0, "geo/rankme_slope": 0.002333030753168484} {"step": 2930, "timestamp": 1778328882.9113998, "train/loss": 2.354960083961487, "train/z_loss": 0.0013500575441867113, "train/perplexity": 10.537708229733012, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698015.2084278113, "perf/iters_per_sec": 0.8096767465724045, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2350607872009278, "data/tokens_consumed": 6146752512, "data/tokens_consumed_B": 6.146752512, "train/loss_slope": -1.5926862242269757e-05} {"step": 2940, "timestamp": 1778328893.2935615, "train/loss": 2.392974853515625, "train/z_loss": 0.0013388226740062236, "train/perplexity": 10.946008327653182, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020739.8917858826, "perf/iters_per_sec": 0.9635638674668706, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037813925743103, "data/tokens_consumed": 6167724032, "data/tokens_consumed_B": 6.167724032, "train/loss_slope": -1.4540045975994999e-05} {"step": 2950, "timestamp": 1778328903.6632395, "grad/layer_0/attn": 0.0031360734719783068, "grad/layer_0/mlp": 0.003369903191924095, "grad/layer_0/attn_mlp_ratio": 0.930612305550086, "grad/layer_4/attn": 0.002492590807378292, "grad/layer_4/mlp": 0.002643345156684518, "grad/layer_4/attn_mlp_ratio": 0.9429683092191622, "grad/layer_8/attn": 0.005704347975552082, "grad/layer_8/mlp": 0.0037893992848694324, "grad/layer_8/attn_mlp_ratio": 1.5053435640299053, "grad/layer_12/attn": 0.005186979193240404, "grad/layer_12/mlp": 0.006869042292237282, "grad/layer_12/attn_mlp_ratio": 0.7551240619947578, "grad/layer_16/attn": 0.00890668947249651, "grad/layer_16/mlp": 0.004326420836150646, "grad/layer_16/attn_mlp_ratio": 2.0586738100479596, "grad/layer_20/attn": 0.0031640881206840277, "grad/layer_20/mlp": 0.005280760116875172, "grad/layer_20/attn_mlp_ratio": 0.5991728445788702, "grad/layer_24/attn": 0.0056037395261228085, "grad/layer_24/mlp": 0.007528161164373159, "grad/layer_24/attn_mlp_ratio": 0.744370282374578, "grad/layer_27/attn": 0.004530389327555895, "grad/layer_27/mlp": 0.006711994297802448, "grad/layer_27/attn_mlp_ratio": 0.6749691759336353} {"step": 2950, "timestamp": 1778328903.6789591, "train/loss": 2.412852144241333, "train/z_loss": 0.0013575156452134251, "train/perplexity": 11.165762137178996, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020184.7816119746, "perf/iters_per_sec": 0.9632991703090547, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380990982055665, "data/tokens_consumed": 6188695552, "data/tokens_consumed_B": 6.188695552, "train/loss_slope": -1.444271461333449e-05} {"step": 2960, "timestamp": 1778328914.0576684, "train/loss": 2.413318562507629, "train/z_loss": 0.0013487383956089615, "train/perplexity": 11.170971267319183, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022066.131623573, "perf/iters_per_sec": 0.9641962679021706, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037133240699768, "data/tokens_consumed": 6209667072, "data/tokens_consumed_B": 6.209667072, "train/loss_slope": -1.0853683143773674e-05} {"step": 2970, "timestamp": 1778328924.4407218, "train/loss": 2.411752486228943, "train/z_loss": 0.0013414529035799204, "train/perplexity": 11.153490365992793, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021136.5128875156, "perf/iters_per_sec": 0.9637529911458567, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376102685928346, "data/tokens_consumed": 6230638592, "data/tokens_consumed_B": 6.230638592, "train/loss_slope": -9.18209005539533e-06} {"step": 2980, "timestamp": 1778328934.8165925, "train/loss": 2.3907907485961912, "train/z_loss": 0.0013448975048959255, "train/perplexity": 10.922127185969487, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021994.8281082353, "perf/iters_per_sec": 0.9641622677365471, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371698141098022, "data/tokens_consumed": 6251610112, "data/tokens_consumed_B": 6.251610112, "train/loss_slope": -9.633832731799217e-06} {"step": 2990, "timestamp": 1778328945.1981142, "train/loss": 2.384182167053223, "train/z_loss": 0.001354413665831089, "train/perplexity": 10.850185396258219, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021430.527127019, "perf/iters_per_sec": 0.9638931880602927, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374593496322633, "data/tokens_consumed": 6272581632, "data/tokens_consumed_B": 6.272581632, "train/loss_slope": -8.776605290190613e-06} {"step": 3000, "timestamp": 1778328955.5715477, "grad/layer_0/attn": 0.0026914416812360287, "grad/layer_0/mlp": 0.0030635432340204716, "grad/layer_0/attn_mlp_ratio": 0.8785387989612304, "grad/layer_4/attn": 0.0022125819232314825, "grad/layer_4/mlp": 0.002756988862529397, "grad/layer_4/attn_mlp_ratio": 0.8025356478763525, "grad/layer_8/attn": 0.006048909388482571, "grad/layer_8/mlp": 0.0036097869742661715, "grad/layer_8/attn_mlp_ratio": 1.675697004846815, "grad/layer_12/attn": 0.006768052466213703, "grad/layer_12/mlp": 0.006910968571901321, "grad/layer_12/attn_mlp_ratio": 0.9793203800404005, "grad/layer_16/attn": 0.004069547168910503, "grad/layer_16/mlp": 0.004464030731469393, "grad/layer_16/attn_mlp_ratio": 0.9116306142471153, "grad/layer_20/attn": 0.00281270919367671, "grad/layer_20/mlp": 0.005735678598284721, "grad/layer_20/attn_mlp_ratio": 0.49038820715634973, "grad/layer_24/attn": 0.006396738346666098, "grad/layer_24/mlp": 0.008095446974039078, "grad/layer_24/attn_mlp_ratio": 0.7901649270464022, "grad/layer_27/attn": 0.003441384295001626, "grad/layer_27/mlp": 0.007193953264504671, "grad/layer_27/attn_mlp_ratio": 0.4783717826114345} {"step": 3000, "timestamp": 1778328956.1743429, "eos/sharpness": 8.101558685302733, "eos/L0_probe": 2.3806004524230957, "eos/L_plus": 2.418637752532959, "eos/L_minus": 2.4235787391662598, "eos/grad_norm": 0.08919170498847961, "eos/embed_grad_frac": 0.31009554862976074, "eos/time_s": 0.6000854969024658} {"step": 3000, "timestamp": 1778328956.195753, "train/loss": 2.3999458074569704, "train/z_loss": 0.0013502887333743275, "train/perplexity": 11.022579022867586, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908183.258499976, "perf/iters_per_sec": 0.9098926823139076, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0990307092666627, "data/tokens_consumed": 6293553152, "data/tokens_consumed_B": 6.293553152, "train/loss_slope": -6.684961227407864e-06} {"step": 3000, "timestamp": 1778328957.557893, "geo/rankme_last": 424.997802734375, "geo/layer_0/stable_rank_q_proj": 20.42576789855957, "geo/layer_0/stable_rank_k_proj": 17.168197631835938, "geo/layer_0/stable_rank_o_proj": 45.70833206176758, "geo/layer_0/stable_rank_gate_proj": 129.9568634033203, "geo/layer_0/stable_rank_down_proj": 56.05083465576172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06562094390392303, "geo/layer_0/attn_entropy_mean": 6.250388145446777, "geo/layer_0/attn_entropy_std": 0.42161324620246887, "geo/layer_7/stable_rank_q_proj": 42.658287048339844, "geo/layer_7/stable_rank_k_proj": 39.59920883178711, "geo/layer_7/stable_rank_o_proj": 89.40833282470703, "geo/layer_7/stable_rank_gate_proj": 79.45134735107422, "geo/layer_7/stable_rank_down_proj": 143.16314697265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43643128871917725, "geo/layer_7/attn_entropy_mean": 4.752960205078125, "geo/layer_7/attn_entropy_std": 0.767749547958374, "geo/layer_14/stable_rank_q_proj": 51.14086151123047, "geo/layer_14/stable_rank_k_proj": 41.45697784423828, "geo/layer_14/stable_rank_o_proj": 42.90315628051758, "geo/layer_14/stable_rank_gate_proj": 71.47592163085938, "geo/layer_14/stable_rank_down_proj": 127.58755493164062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3738726079463959, "geo/layer_14/attn_entropy_mean": 5.537776947021484, "geo/layer_14/attn_entropy_std": 0.43631863594055176, "geo/layer_21/stable_rank_q_proj": 39.884521484375, "geo/layer_21/stable_rank_k_proj": 29.516204833984375, "geo/layer_21/stable_rank_o_proj": 66.69998168945312, "geo/layer_21/stable_rank_gate_proj": 63.397586822509766, "geo/layer_21/stable_rank_down_proj": 49.85918426513672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1431637406349182, "geo/layer_21/attn_entropy_mean": 5.881137847900391, "geo/layer_21/attn_entropy_std": 0.3060644268989563, "geo/layer_27/stable_rank_q_proj": 43.73058319091797, "geo/layer_27/stable_rank_k_proj": 31.0375919342041, "geo/layer_27/stable_rank_o_proj": 112.21544647216797, "geo/layer_27/stable_rank_gate_proj": 74.84366607666016, "geo/layer_27/stable_rank_down_proj": 127.19879150390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10127515345811844, "geo/layer_27/attn_entropy_mean": 4.326253890991211, "geo/layer_27/attn_entropy_std": 0.6655479669570923, "attnres/final_alpha/block_0": 0.2508006691932678, "attnres/block_norm/0": 1.7792118787765503, "attnres/final_alpha/block_1": 0.004130593966692686, "attnres/block_norm/1": 49319.640625, "attnres/final_alpha/block_2": 0.008908728137612343, "attnres/block_norm/2": 29590.6796875, "attnres/final_alpha/block_3": 0.01068219542503357, "attnres/block_norm/3": 67202.15625, "attnres/final_alpha/block_4": 0.012376796454191208, "attnres/block_norm/4": 16566.35546875, "attnres/final_alpha/block_5": 0.6085389256477356, "attnres/block_norm/5": 6930.80126953125, "attnres/final_alpha/block_6": 0.10456208884716034, "attnres/block_norm/6": 44906.8828125, "geo/tier1_time_s": 1.3579051494598389, "geo/step": 3000.0, "geo/rankme_slope": 0.0022957390088514445} {"step": 3000, "timestamp": 1778328964.5131276, "geo/ww_alpha_mean": 7.656428551717557, "geo/ww_alpha_std": 4.67510116584856, "geo/ww_alpha_min": 1.3614793392657178, "geo/ww_alpha_max": 28.44649452364904, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.8931134516133263, "geo/ww_alpha_by_type/k_proj": 4.519123013583394, "geo/ww_alpha_by_type/v_proj": 8.19295925857877, "geo/ww_alpha_by_type/o_proj": 9.172061542238529, "geo/ww_alpha_by_type/gate_proj": 7.97611947515638, "geo/ww_alpha_by_type/up_proj": 12.033870691982482, "geo/ww_alpha_by_type/down_proj": 7.902180121546586, "geo/twonn_id/layer_0": 0.7424030303955078, "geo/twonn_id/layer_7": 4.01209831237793, "geo/twonn_id/layer_14": 5.495733261108398, "geo/twonn_id/layer_21": 8.086136817932129, "geo/twonn_id/layer_27": 6.375935077667236, "geo/tier2_time_s": 6.94931173324585} {"step": 3000, "timestamp": 1778328965.2666268, "eoc/jacobian_sigma/layer_0/attn": 1392.705078125, "eoc/jacobian_sigma/layer_0/mlp": 11153.015625, "eoc/jacobian_sigma/layer_0": 11153.015625, "eoc/jacobian_sigma/layer_7/attn": 1.0823513269424438, "eoc/jacobian_sigma/layer_7/mlp": 1.8155405521392822, "eoc/jacobian_sigma/layer_7": 1.8155405521392822, "eoc/jacobian_sigma/layer_14/attn": 1.9243468046188354, "eoc/jacobian_sigma/layer_14/mlp": 12.671727180480957, "eoc/jacobian_sigma/layer_14": 12.671727180480957, "eoc/jacobian_sigma/layer_21/attn": 1.0415327548980713, "eoc/jacobian_sigma/layer_21/mlp": 4.576686859130859, "eoc/jacobian_sigma/layer_21": 4.576686859130859, "eoc/jacobian_sigma/layer_27/attn": 3.574556827545166, "eoc/jacobian_sigma/layer_27/mlp": 26.62051773071289, "eoc/jacobian_sigma/layer_27": 26.62051773071289, "eoc/layer0_sigma": 11153.015625, "eoc/sigma_max": 26.62051773071289, "eoc/sigma_min": 1.8155405521392822, "eoc/sigma_mean": 11.421118080615997, "eoc/time_s": 0.7470986843109131} {"step": 3010, "timestamp": 1778328975.6662538, "train/loss": 2.3600977659225464, "train/z_loss": 0.0013561409316025675, "train/perplexity": 10.591986937189908, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1077308.8709982443, "perf/iters_per_sec": 0.5137009005538198, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9466580629348755, "data/tokens_consumed": 6314524672, "data/tokens_consumed_B": 6.314524672, "train/loss_slope": -7.662725834885157e-06} {"step": 3020, "timestamp": 1778328986.0429456, "train/loss": 2.3424972772598265, "train/z_loss": 0.0013747977325692772, "train/perplexity": 10.407193786431117, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022094.7660019028, "perf/iters_per_sec": 0.9642099218377603, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371185541152954, "data/tokens_consumed": 6335496192, "data/tokens_consumed_B": 6.335496192, "train/loss_slope": -9.952374245717777e-06} {"step": 3030, "timestamp": 1778328996.4215136, "train/loss": 2.3807586431503296, "train/z_loss": 0.0013549727736972272, "train/perplexity": 10.81310303958921, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021979.9079958298, "perf/iters_per_sec": 0.9641551532725476, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371774673461913, "data/tokens_consumed": 6356467712, "data/tokens_consumed_B": 6.356467712, "train/loss_slope": -1.0267626784994519e-05} {"step": 3040, "timestamp": 1778329006.8011677, "train/loss": 2.4231173753738404, "train/z_loss": 0.0013457765802741052, "train/perplexity": 11.280971580388474, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021348.7240561855, "perf/iters_per_sec": 0.9638541813164642, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037501335144043, "data/tokens_consumed": 6377439232, "data/tokens_consumed_B": 6.377439232, "train/loss_slope": -6.14723634190511e-06} {"step": 3050, "timestamp": 1778329017.1742392, "grad/layer_0/attn": 0.003222245955839753, "grad/layer_0/mlp": 0.003317087423056364, "grad/layer_0/attn_mlp_ratio": 0.9714081806532425, "grad/layer_4/attn": 0.002049959497526288, "grad/layer_4/mlp": 0.0027006620075553656, "grad/layer_4/attn_mlp_ratio": 0.7590581183004443, "grad/layer_8/attn": 0.004155359696596861, "grad/layer_8/mlp": 0.0036563249304890633, "grad/layer_8/attn_mlp_ratio": 1.1364853129703973, "grad/layer_12/attn": 0.011453661136329174, "grad/layer_12/mlp": 0.007284415420144796, "grad/layer_12/attn_mlp_ratio": 1.5723514267760366, "grad/layer_16/attn": 0.0038260980509221554, "grad/layer_16/mlp": 0.0047921896912157536, "grad/layer_16/attn_mlp_ratio": 0.7984028633288947, "grad/layer_20/attn": 0.004434074740856886, "grad/layer_20/mlp": 0.006725218612700701, "grad/layer_20/attn_mlp_ratio": 0.6593205262578381, "grad/layer_24/attn": 0.013443737290799618, "grad/layer_24/mlp": 0.01127729844301939, "grad/layer_24/attn_mlp_ratio": 1.192106180351255, "grad/layer_27/attn": 0.005309418775141239, "grad/layer_27/mlp": 0.011557354591786861, "grad/layer_27/attn_mlp_ratio": 0.4593974068230625} {"step": 3050, "timestamp": 1778329017.189667, "train/loss": 2.424796485900879, "train/z_loss": 0.0013602802529931069, "train/perplexity": 11.299929490283182, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019935.1023962616, "perf/iters_per_sec": 0.9631801139813717, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038227415084839, "data/tokens_consumed": 6398410752, "data/tokens_consumed_B": 6.398410752, "train/loss_slope": -6.772705788301976e-06} {"step": 3060, "timestamp": 1778329027.570807, "train/loss": 2.4122351884841917, "train/z_loss": 0.0013476476655341686, "train/perplexity": 11.158875480545262, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021562.373608125, "perf/iters_per_sec": 0.9639560573616623, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037391686439514, "data/tokens_consumed": 6419382272, "data/tokens_consumed_B": 6.419382272, "train/loss_slope": -5.492809503385407e-06} {"step": 3070, "timestamp": 1778329037.9502501, "train/loss": 2.406696057319641, "train/z_loss": 0.0013533916790038347, "train/perplexity": 11.097235878132958, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021317.9276820263, "perf/iters_per_sec": 0.9638394964609271, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375171422958374, "data/tokens_consumed": 6440353792, "data/tokens_consumed_B": 6.440353792, "train/loss_slope": -5.300454709968427e-06} {"step": 3075, "timestamp": 1778329043.738828, "eos/sharpness": 47.526788711547844, "eos/L0_probe": 2.377575397491455, "eos/L_plus": 2.615093231201172, "eos/L_minus": 2.615325450897217, "eos/grad_norm": 0.16335423290729523, "eos/embed_grad_frac": 0.08803277462720871, "eos/time_s": 0.6110091209411621} {"step": 3075, "timestamp": 1778329045.1201413, "geo/rankme_last": 424.3453369140625, "geo/layer_0/stable_rank_q_proj": 20.41908073425293, "geo/layer_0/stable_rank_k_proj": 17.138322830200195, "geo/layer_0/stable_rank_o_proj": 45.71162414550781, "geo/layer_0/stable_rank_gate_proj": 129.95016479492188, "geo/layer_0/stable_rank_down_proj": 55.97987747192383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06606649607419968, "geo/layer_0/attn_entropy_mean": 6.257104873657227, "geo/layer_0/attn_entropy_std": 0.418660968542099, "geo/layer_7/stable_rank_q_proj": 42.63197326660156, "geo/layer_7/stable_rank_k_proj": 39.46126174926758, "geo/layer_7/stable_rank_o_proj": 89.4622802734375, "geo/layer_7/stable_rank_gate_proj": 79.59355926513672, "geo/layer_7/stable_rank_down_proj": 143.23106384277344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42185288667678833, "geo/layer_7/attn_entropy_mean": 4.720189571380615, "geo/layer_7/attn_entropy_std": 0.7615688443183899, "geo/layer_14/stable_rank_q_proj": 51.24660110473633, "geo/layer_14/stable_rank_k_proj": 41.54304122924805, "geo/layer_14/stable_rank_o_proj": 42.86320495605469, "geo/layer_14/stable_rank_gate_proj": 71.48516082763672, "geo/layer_14/stable_rank_down_proj": 127.70356750488281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.369546115398407, "geo/layer_14/attn_entropy_mean": 5.545731544494629, "geo/layer_14/attn_entropy_std": 0.4212215542793274, "geo/layer_21/stable_rank_q_proj": 39.86489486694336, "geo/layer_21/stable_rank_k_proj": 29.483644485473633, "geo/layer_21/stable_rank_o_proj": 66.67218780517578, "geo/layer_21/stable_rank_gate_proj": 63.36014938354492, "geo/layer_21/stable_rank_down_proj": 49.83808517456055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13804033398628235, "geo/layer_21/attn_entropy_mean": 5.867279529571533, "geo/layer_21/attn_entropy_std": 0.31659480929374695, "geo/layer_27/stable_rank_q_proj": 43.723487854003906, "geo/layer_27/stable_rank_k_proj": 31.10055160522461, "geo/layer_27/stable_rank_o_proj": 112.08594512939453, "geo/layer_27/stable_rank_gate_proj": 74.76685333251953, "geo/layer_27/stable_rank_down_proj": 127.1920166015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10593735426664352, "geo/layer_27/attn_entropy_mean": 4.320639610290527, "geo/layer_27/attn_entropy_std": 0.6659321784973145, "attnres/final_alpha/block_0": 0.25261539220809937, "attnres/block_norm/0": 1.7792768478393555, "attnres/final_alpha/block_1": 0.0041323937475681305, "attnres/block_norm/1": 49448.4453125, "attnres/final_alpha/block_2": 0.008947299793362617, "attnres/block_norm/2": 29622.140625, "attnres/final_alpha/block_3": 0.010718920268118382, "attnres/block_norm/3": 67956.234375, "attnres/final_alpha/block_4": 0.012306928634643555, "attnres/block_norm/4": 16591.673828125, "attnres/final_alpha/block_5": 0.6058710813522339, "attnres/block_norm/5": 6966.21630859375, "attnres/final_alpha/block_6": 0.10540799796581268, "attnres/block_norm/6": 45094.01953125, "geo/tier1_time_s": 1.3616032600402832, "geo/step": 3075.0, "geo/rankme_slope": 0.0022269976905719925} {"step": 3080, "timestamp": 1778329050.309168, "train/loss": 2.3849992513656617, "train/z_loss": 0.0013584837433882058, "train/perplexity": 10.859054535456366, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697755.0500070045, "perf/iters_per_sec": 0.8095526933703444, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2352500438690186, "data/tokens_consumed": 6461325312, "data/tokens_consumed_B": 6.461325312, "train/loss_slope": -5.592518971555102e-06} {"step": 3090, "timestamp": 1778329060.6890483, "train/loss": 2.3930776596069334, "train/z_loss": 0.0013572038267739117, "train/perplexity": 10.947133701831445, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021613.5744464311, "perf/iters_per_sec": 0.9639804718238979, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373654127120973, "data/tokens_consumed": 6482296832, "data/tokens_consumed_B": 6.482296832, "train/loss_slope": -5.385950124553536e-06} {"step": 3100, "timestamp": 1778329071.056164, "grad/layer_0/attn": 0.002806351287290454, "grad/layer_0/mlp": 0.0029764475766569376, "grad/layer_0/attn_mlp_ratio": 0.9428525518185721, "grad/layer_4/attn": 0.002552880672737956, "grad/layer_4/mlp": 0.002619535895064473, "grad/layer_4/attn_mlp_ratio": 0.9745545308588606, "grad/layer_8/attn": 0.004173241090029478, "grad/layer_8/mlp": 0.0034389998763799667, "grad/layer_8/attn_mlp_ratio": 1.2135042508556233, "grad/layer_12/attn": 0.010252474807202816, "grad/layer_12/mlp": 0.006669799797236919, "grad/layer_12/attn_mlp_ratio": 1.5371487848458667, "grad/layer_16/attn": 0.003259087447077036, "grad/layer_16/mlp": 0.004353546537458897, "grad/layer_16/attn_mlp_ratio": 0.7486051531032454, "grad/layer_20/attn": 0.003996995277702808, "grad/layer_20/mlp": 0.005897800903767347, "grad/layer_20/attn_mlp_ratio": 0.6777094166367502, "grad/layer_24/attn": 0.0063480110839009285, "grad/layer_24/mlp": 0.008627482689917088, "grad/layer_24/attn_mlp_ratio": 0.7357894809503219, "grad/layer_27/attn": 0.004457706119865179, "grad/layer_27/mlp": 0.008323160000145435, "grad/layer_27/attn_mlp_ratio": 0.5355785622563343} {"step": 3100, "timestamp": 1778329071.0718892, "train/loss": 2.373682427406311, "train/z_loss": 0.0013518007355742157, "train/perplexity": 10.736857273517089, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021059.7488443938, "perf/iters_per_sec": 0.9637163871976823, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03764967918396, "data/tokens_consumed": 6503268352, "data/tokens_consumed_B": 6.503268352, "train/loss_slope": -4.185539775519995e-06} {"step": 3110, "timestamp": 1778329081.4512875, "train/loss": 2.3653480768203736, "train/z_loss": 0.0013709368999116124, "train/perplexity": 10.647744405574784, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021300.5557823763, "perf/iters_per_sec": 0.9638312128936655, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375260591506958, "data/tokens_consumed": 6524239872, "data/tokens_consumed_B": 6.524239872, "train/loss_slope": -5.141982980723078e-06} {"step": 3120, "timestamp": 1778329091.8372567, "train/loss": 2.4123064517974853, "train/z_loss": 0.0013503782334737479, "train/perplexity": 11.15967072732026, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020285.1902828526, "perf/iters_per_sec": 0.9633470488943351, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380475044250488, "data/tokens_consumed": 6545211392, "data/tokens_consumed_B": 6.545211392, "train/loss_slope": -2.8546511715610153e-06} {"step": 3130, "timestamp": 1778329102.2143233, "train/loss": 2.391854667663574, "train/z_loss": 0.0013495841645635664, "train/perplexity": 10.933753629039568, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022217.8658778414, "perf/iters_per_sec": 0.9642686204327781, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370554208755494, "data/tokens_consumed": 6566182912, "data/tokens_consumed_B": 6.566182912, "train/loss_slope": -4.231844011313138e-06} {"step": 3140, "timestamp": 1778329112.5904062, "train/loss": 2.3663871765136717, "train/z_loss": 0.001351691025774926, "train/perplexity": 10.658814223847285, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022125.7255116578, "perf/iters_per_sec": 0.9642246844824113, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371026754379273, "data/tokens_consumed": 6587154432, "data/tokens_consumed_B": 6.587154432, "train/loss_slope": -5.380910515177275e-06} {"step": 3150, "timestamp": 1778329122.961741, "grad/layer_0/attn": 0.003168596187606454, "grad/layer_0/mlp": 0.0032576376106590033, "grad/layer_0/attn_mlp_ratio": 0.9726668429821999, "grad/layer_4/attn": 0.00236925994977355, "grad/layer_4/mlp": 0.002612575888633728, "grad/layer_4/attn_mlp_ratio": 0.9068673830278052, "grad/layer_8/attn": 0.004728594329208136, "grad/layer_8/mlp": 0.0034347211476415396, "grad/layer_8/attn_mlp_ratio": 1.3767039559484011, "grad/layer_12/attn": 0.004834852181375027, "grad/layer_12/mlp": 0.006570089142769575, "grad/layer_12/attn_mlp_ratio": 0.735888357482482, "grad/layer_16/attn": 0.003277850104495883, "grad/layer_16/mlp": 0.004057120531797409, "grad/layer_16/attn_mlp_ratio": 0.8079252262814062, "grad/layer_20/attn": 0.006433728616684675, "grad/layer_20/mlp": 0.006069473922252655, "grad/layer_20/attn_mlp_ratio": 1.0600141944914083, "grad/layer_24/attn": 0.009588109329342842, "grad/layer_24/mlp": 0.010357878170907497, "grad/layer_24/attn_mlp_ratio": 0.9256827584345406, "grad/layer_27/attn": 0.006729492451995611, "grad/layer_27/mlp": 0.010332171805202961, "grad/layer_27/attn_mlp_ratio": 0.6513144103425969} {"step": 3150, "timestamp": 1778329123.5661275, "eos/sharpness": 40.42189121246337, "eos/L0_probe": 2.379246234893799, "eos/L_plus": 2.5678184032440186, "eos/L_minus": 2.594892978668213, "eos/grad_norm": 0.13966326415538788, "eos/embed_grad_frac": 0.1454601287841797, "eos/time_s": 0.6013681888580322} {"step": 3150, "timestamp": 1778329123.5882077, "train/loss": 2.374447393417358, "train/z_loss": 0.0013484449824318289, "train/perplexity": 10.745073746657466, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908451.4959267923, "perf/iters_per_sec": 0.9100205878862344, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0988762378692627, "data/tokens_consumed": 6608125952, "data/tokens_consumed_B": 6.608125952, "train/loss_slope": -7.823252842442836e-06} {"step": 3150, "timestamp": 1778329124.9539728, "geo/rankme_last": 424.438232421875, "geo/layer_0/stable_rank_q_proj": 20.40886116027832, "geo/layer_0/stable_rank_k_proj": 17.135589599609375, "geo/layer_0/stable_rank_o_proj": 45.68760299682617, "geo/layer_0/stable_rank_gate_proj": 130.03114318847656, "geo/layer_0/stable_rank_down_proj": 55.98126220703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06801709532737732, "geo/layer_0/attn_entropy_mean": 6.259518146514893, "geo/layer_0/attn_entropy_std": 0.42033934593200684, "geo/layer_7/stable_rank_q_proj": 42.60823059082031, "geo/layer_7/stable_rank_k_proj": 39.535606384277344, "geo/layer_7/stable_rank_o_proj": 89.49504089355469, "geo/layer_7/stable_rank_gate_proj": 79.75581359863281, "geo/layer_7/stable_rank_down_proj": 143.1750946044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4270194172859192, "geo/layer_7/attn_entropy_mean": 4.7423858642578125, "geo/layer_7/attn_entropy_std": 0.77973473072052, "geo/layer_14/stable_rank_q_proj": 51.29340744018555, "geo/layer_14/stable_rank_k_proj": 41.587608337402344, "geo/layer_14/stable_rank_o_proj": 42.82352066040039, "geo/layer_14/stable_rank_gate_proj": 71.40901184082031, "geo/layer_14/stable_rank_down_proj": 128.04226684570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3896174132823944, "geo/layer_14/attn_entropy_mean": 5.532320499420166, "geo/layer_14/attn_entropy_std": 0.41959476470947266, "geo/layer_21/stable_rank_q_proj": 39.81985092163086, "geo/layer_21/stable_rank_k_proj": 29.41814613342285, "geo/layer_21/stable_rank_o_proj": 66.720703125, "geo/layer_21/stable_rank_gate_proj": 63.23733139038086, "geo/layer_21/stable_rank_down_proj": 49.82304000854492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14294037222862244, "geo/layer_21/attn_entropy_mean": 5.866702079772949, "geo/layer_21/attn_entropy_std": 0.3179542124271393, "geo/layer_27/stable_rank_q_proj": 43.65167236328125, "geo/layer_27/stable_rank_k_proj": 31.083423614501953, "geo/layer_27/stable_rank_o_proj": 112.12466430664062, "geo/layer_27/stable_rank_gate_proj": 74.69110107421875, "geo/layer_27/stable_rank_down_proj": 126.94816589355469, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11003845930099487, "geo/layer_27/attn_entropy_mean": 4.306217670440674, "geo/layer_27/attn_entropy_std": 0.6479353308677673, "attnres/final_alpha/block_0": 0.25040319561958313, "attnres/block_norm/0": 1.779158115386963, "attnres/final_alpha/block_1": 0.004081286024302244, "attnres/block_norm/1": 49460.6796875, "attnres/final_alpha/block_2": 0.008676847442984581, "attnres/block_norm/2": 29675.6328125, "attnres/final_alpha/block_3": 0.010454105213284492, "attnres/block_norm/3": 68217.484375, "attnres/final_alpha/block_4": 0.012107457965612411, "attnres/block_norm/4": 16531.73828125, "attnres/final_alpha/block_5": 0.6104861497879028, "attnres/block_norm/5": 6918.92724609375, "attnres/final_alpha/block_6": 0.10379096120595932, "attnres/block_norm/6": 45419.546875, "geo/tier1_time_s": 1.36232328414917, "geo/step": 3150.0, "geo/rankme_slope": 0.0021642777606054315} {"step": 3160, "timestamp": 1778329135.3320918, "train/loss": 2.4097633838653563, "train/z_loss": 0.0013460141490213573, "train/perplexity": 11.131326981870933, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786226.7095351128, "perf/iters_per_sec": 0.851739268081242, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1740682125091553, "data/tokens_consumed": 6629097472, "data/tokens_consumed_B": 6.629097472, "train/loss_slope": -4.6168544218772275e-06} {"step": 3170, "timestamp": 1778329146.0989168, "train/loss": 2.3820357799530028, "train/z_loss": 0.001346228423062712, "train/perplexity": 10.826921673696985, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948680.8111442097, "perf/iters_per_sec": 0.9292034202309655, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0761906147003173, "data/tokens_consumed": 6650068992, "data/tokens_consumed_B": 6.650068992, "train/loss_slope": -7.1379194737483245e-06} {"step": 3180, "timestamp": 1778329156.4800816, "train/loss": 2.359729290008545, "train/z_loss": 0.0013576455530710518, "train/perplexity": 10.588084764094686, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021774.4883369484, "perf/iters_per_sec": 0.9640572015461676, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372828483581542, "data/tokens_consumed": 6671040512, "data/tokens_consumed_B": 6.671040512, "train/loss_slope": -8.896995084335194e-06} {"step": 3190, "timestamp": 1778329166.8392498, "train/loss": 2.381182837486267, "train/z_loss": 0.0013455674052238464, "train/perplexity": 10.817690869649478, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025519.4379321295, "perf/iters_per_sec": 0.9658429326687477, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353650331497193, "data/tokens_consumed": 6692012032, "data/tokens_consumed_B": 6.692012032, "train/loss_slope": -9.477884076287684e-06} {"step": 3200, "timestamp": 1778329177.1845107, "grad/layer_0/attn": 0.00304296868853271, "grad/layer_0/mlp": 0.0032373119611293077, "grad/layer_0/attn_mlp_ratio": 0.9399676741299995, "grad/layer_4/attn": 0.003280151169747114, "grad/layer_4/mlp": 0.0026221410371363163, "grad/layer_4/attn_mlp_ratio": 1.2509437891392152, "grad/layer_8/attn": 0.005605255253612995, "grad/layer_8/mlp": 0.003572144778445363, "grad/layer_8/attn_mlp_ratio": 1.569156751573991, "grad/layer_12/attn": 0.00838461983948946, "grad/layer_12/mlp": 0.007216737139970064, "grad/layer_12/attn_mlp_ratio": 1.1618297245257942, "grad/layer_16/attn": 0.004540594760328531, "grad/layer_16/mlp": 0.005402791313827038, "grad/layer_16/attn_mlp_ratio": 0.8404164463406938, "grad/layer_20/attn": 0.0032391920685768127, "grad/layer_20/mlp": 0.007378846872597933, "grad/layer_20/attn_mlp_ratio": 0.4389834998077434, "grad/layer_24/attn": 0.01599801890552044, "grad/layer_24/mlp": 0.014086724258959293, "grad/layer_24/attn_mlp_ratio": 1.1356805526861569, "grad/layer_27/attn": 0.004591530654579401, "grad/layer_27/mlp": 0.014422635547816753, "grad/layer_27/attn_mlp_ratio": 0.3183558655088434} {"step": 3200, "timestamp": 1778329177.2005477, "train/loss": 2.3510676860809325, "train/z_loss": 0.0013494957587681712, "train/perplexity": 10.496771000232208, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025378.7735337557, "perf/iters_per_sec": 0.9657758586567667, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354369401931762, "data/tokens_consumed": 6712983552, "data/tokens_consumed_B": 6.712983552, "train/loss_slope": -1.2381415777724433e-05} {"step": 3210, "timestamp": 1778329187.5656574, "train/loss": 2.3377023935317993, "train/z_loss": 0.0013658120878972113, "train/perplexity": 10.357411946734135, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024590.696171488, "perf/iters_per_sec": 0.9654000740868988, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358399868011474, "data/tokens_consumed": 6733955072, "data/tokens_consumed_B": 6.733955072, "train/loss_slope": -1.4964894535947244e-05} {"step": 3220, "timestamp": 1778329197.9237924, "train/loss": 2.3471269607543945, "train/z_loss": 0.001354713412001729, "train/perplexity": 10.455487505786145, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025911.3118122993, "perf/iters_per_sec": 0.9660297926961419, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035164761543274, "data/tokens_consumed": 6754926592, "data/tokens_consumed_B": 6.754926592, "train/loss_slope": -1.5804888255740052e-05} {"step": 3225, "timestamp": 1778329203.6888068, "eos/sharpness": 56.40633106231688, "eos/L0_probe": 2.3775017261505127, "eos/L_plus": 2.5889649391174316, "eos/L_minus": 2.7301018238067627, "eos/grad_norm": 0.14036248624324799, "eos/embed_grad_frac": 0.1169256642460823, "eos/time_s": 0.5952601432800293} {"step": 3225, "timestamp": 1778329205.0694928, "geo/rankme_last": 424.3253173828125, "geo/layer_0/stable_rank_q_proj": 20.394014358520508, "geo/layer_0/stable_rank_k_proj": 17.18608856201172, "geo/layer_0/stable_rank_o_proj": 45.64943313598633, "geo/layer_0/stable_rank_gate_proj": 130.29751586914062, "geo/layer_0/stable_rank_down_proj": 56.08882141113281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06775901466608047, "geo/layer_0/attn_entropy_mean": 6.259912967681885, "geo/layer_0/attn_entropy_std": 0.42014533281326294, "geo/layer_7/stable_rank_q_proj": 42.62849044799805, "geo/layer_7/stable_rank_k_proj": 39.60236358642578, "geo/layer_7/stable_rank_o_proj": 89.60831451416016, "geo/layer_7/stable_rank_gate_proj": 79.81010437011719, "geo/layer_7/stable_rank_down_proj": 143.1210174560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42692384123802185, "geo/layer_7/attn_entropy_mean": 4.7367353439331055, "geo/layer_7/attn_entropy_std": 0.7602463960647583, "geo/layer_14/stable_rank_q_proj": 51.36665344238281, "geo/layer_14/stable_rank_k_proj": 41.686424255371094, "geo/layer_14/stable_rank_o_proj": 42.7999382019043, "geo/layer_14/stable_rank_gate_proj": 71.46614074707031, "geo/layer_14/stable_rank_down_proj": 127.82340240478516, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3788894712924957, "geo/layer_14/attn_entropy_mean": 5.5482330322265625, "geo/layer_14/attn_entropy_std": 0.4039302170276642, "geo/layer_21/stable_rank_q_proj": 39.86863708496094, "geo/layer_21/stable_rank_k_proj": 29.486797332763672, "geo/layer_21/stable_rank_o_proj": 66.59294891357422, "geo/layer_21/stable_rank_gate_proj": 63.16207504272461, "geo/layer_21/stable_rank_down_proj": 49.83823013305664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13813835382461548, "geo/layer_21/attn_entropy_mean": 5.859379291534424, "geo/layer_21/attn_entropy_std": 0.3262949585914612, "geo/layer_27/stable_rank_q_proj": 43.69575119018555, "geo/layer_27/stable_rank_k_proj": 30.986942291259766, "geo/layer_27/stable_rank_o_proj": 111.7962875366211, "geo/layer_27/stable_rank_gate_proj": 74.6506118774414, "geo/layer_27/stable_rank_down_proj": 127.19232177734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1031402125954628, "geo/layer_27/attn_entropy_mean": 4.29435920715332, "geo/layer_27/attn_entropy_std": 0.6484919190406799, "attnres/final_alpha/block_0": 0.25160208344459534, "attnres/block_norm/0": 1.7791125774383545, "attnres/final_alpha/block_1": 0.004037095699459314, "attnres/block_norm/1": 49455.7734375, "attnres/final_alpha/block_2": 0.008791111409664154, "attnres/block_norm/2": 29678.396484375, "attnres/final_alpha/block_3": 0.010653309524059296, "attnres/block_norm/3": 68327.234375, "attnres/final_alpha/block_4": 0.01234695129096508, "attnres/block_norm/4": 16512.263671875, "attnres/final_alpha/block_5": 0.609123170375824, "attnres/block_norm/5": 6903.78515625, "attnres/final_alpha/block_6": 0.10344626009464264, "attnres/block_norm/6": 45316.2421875, "geo/tier1_time_s": 1.36118745803833, "geo/step": 3225.0, "geo/rankme_slope": 0.002098609786317022} {"step": 3230, "timestamp": 1778329210.2459228, "train/loss": 2.3826461315155028, "train/z_loss": 0.0013571899035014212, "train/perplexity": 10.83353191933926, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702828.2877688974, "perf/iters_per_sec": 0.8119718016476142, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2315698623657227, "data/tokens_consumed": 6775898112, "data/tokens_consumed_B": 6.775898112, "train/loss_slope": -1.6279135328350764e-05} {"step": 3240, "timestamp": 1778329220.6094525, "train/loss": 2.370723271369934, "train/z_loss": 0.001343327190261334, "train/perplexity": 10.705132200375614, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024377.4310558445, "perf/iters_per_sec": 0.9652983813552115, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359491109848022, "data/tokens_consumed": 6796869632, "data/tokens_consumed_B": 6.796869632, "train/loss_slope": -1.6779766646441683e-05} {"step": 3250, "timestamp": 1778329230.9567087, "grad/layer_0/attn": 0.0027752816677093506, "grad/layer_0/mlp": 0.0029724014457315207, "grad/layer_0/attn_mlp_ratio": 0.9336832944710174, "grad/layer_4/attn": 0.0017840361688286066, "grad/layer_4/mlp": 0.00259154150262475, "grad/layer_4/attn_mlp_ratio": 0.6884073043711547, "grad/layer_8/attn": 0.004275492392480373, "grad/layer_8/mlp": 0.003459523431956768, "grad/layer_8/attn_mlp_ratio": 1.235861630362167, "grad/layer_12/attn": 0.006782113574445248, "grad/layer_12/mlp": 0.006380496546626091, "grad/layer_12/attn_mlp_ratio": 1.0629444618596462, "grad/layer_16/attn": 0.005602908320724964, "grad/layer_16/mlp": 0.004084920976310968, "grad/layer_16/attn_mlp_ratio": 1.3716074842221597, "grad/layer_20/attn": 0.004987138323485851, "grad/layer_20/mlp": 0.006151371169835329, "grad/layer_20/attn_mlp_ratio": 0.810736030182642, "grad/layer_24/attn": 0.015311835333704948, "grad/layer_24/mlp": 0.009710741229355335, "grad/layer_24/attn_mlp_ratio": 1.5767936570833827, "grad/layer_27/attn": 0.010931728407740593, "grad/layer_27/mlp": 0.00970388948917389, "grad/layer_27/attn_mlp_ratio": 1.1265305841831237} {"step": 3250, "timestamp": 1778329230.9723232, "train/loss": 2.3856625080108644, "train/z_loss": 0.0013581842416897417, "train/perplexity": 10.866259264565738, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024980.019696613, "perf/iters_per_sec": 0.9655857180102411, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035640835762024, "data/tokens_consumed": 6817841152, "data/tokens_consumed_B": 6.817841152, "train/loss_slope": -1.6894869828703552e-05} {"step": 3260, "timestamp": 1778329241.3227441, "train/loss": 2.385766696929932, "train/z_loss": 0.0013496170518919825, "train/perplexity": 10.867391467353286, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027604.4084688055, "perf/iters_per_sec": 0.9668371240943935, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343003749847413, "data/tokens_consumed": 6838812672, "data/tokens_consumed_B": 6.838812672, "train/loss_slope": -1.5030637947675473e-05} {"step": 3270, "timestamp": 1778329251.6812599, "train/loss": 2.3900420665740967, "train/z_loss": 0.0013532456359826028, "train/perplexity": 10.913953045999959, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025558.8050757167, "perf/iters_per_sec": 0.9658617043856224, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035344910621643, "data/tokens_consumed": 6859784192, "data/tokens_consumed_B": 6.859784192, "train/loss_slope": -1.689249680201793e-05} {"step": 3280, "timestamp": 1778329262.0292017, "train/loss": 2.3541452169418333, "train/z_loss": 0.001338598702568561, "train/perplexity": 10.529124896446428, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027934.063377572, "perf/iters_per_sec": 0.9669943158042774, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341322422027588, "data/tokens_consumed": 6880755712, "data/tokens_consumed_B": 6.880755712, "train/loss_slope": -1.9486394301928757e-05} {"step": 3290, "timestamp": 1778329272.372599, "train/loss": 2.419647216796875, "train/z_loss": 0.0013555305893532931, "train/perplexity": 11.24189266433583, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028504.0600803525, "perf/iters_per_sec": 0.9672661114122164, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338416576385498, "data/tokens_consumed": 6901727232, "data/tokens_consumed_B": 6.901727232, "train/loss_slope": -1.7414135479404974e-05} {"step": 3300, "timestamp": 1778329283.1410763, "grad/layer_0/attn": 0.0026378489565104246, "grad/layer_0/mlp": 0.0029152128845453262, "grad/layer_0/attn_mlp_ratio": 0.904856341713173, "grad/layer_4/attn": 0.0025039133615791798, "grad/layer_4/mlp": 0.002440424170345068, "grad/layer_4/attn_mlp_ratio": 1.0260155957329233, "grad/layer_8/attn": 0.004420969169586897, "grad/layer_8/mlp": 0.0035544217098504305, "grad/layer_8/attn_mlp_ratio": 1.2437941825967274, "grad/layer_12/attn": 0.0057495455257594585, "grad/layer_12/mlp": 0.006684182211756706, "grad/layer_12/attn_mlp_ratio": 0.8601718591138172, "grad/layer_16/attn": 0.007757006213068962, "grad/layer_16/mlp": 0.004281728062778711, "grad/layer_16/attn_mlp_ratio": 1.811653126534521, "grad/layer_20/attn": 0.0028623060788959265, "grad/layer_20/mlp": 0.005551574751734734, "grad/layer_20/attn_mlp_ratio": 0.5155845242726981, "grad/layer_24/attn": 0.009077543392777443, "grad/layer_24/mlp": 0.007635889574885368, "grad/layer_24/attn_mlp_ratio": 1.1887997049818189, "grad/layer_27/attn": 0.005776792764663696, "grad/layer_27/mlp": 0.007201578933745623, "grad/layer_27/attn_mlp_ratio": 0.8021564073093456} {"step": 3300, "timestamp": 1778329283.7410142, "eos/sharpness": 40.41817188262939, "eos/L0_probe": 2.378251791000366, "eos/L_plus": 2.543213129043579, "eos/L_minus": 2.6174721717834473, "eos/grad_norm": 0.11066564917564392, "eos/embed_grad_frac": 0.17577126622200012, "eos/time_s": 0.5969600677490234} {"step": 3300, "timestamp": 1778329283.763886, "train/loss": 2.368594765663147, "train/z_loss": 0.0013469778932631016, "train/perplexity": 10.682370498195256, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1841683.2116036038, "perf/iters_per_sec": 0.878182988931467, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1387148380279541, "data/tokens_consumed": 6922698752, "data/tokens_consumed_B": 6.922698752, "train/loss_slope": -1.9349736291797914e-05} {"step": 3300, "timestamp": 1778329285.1304924, "geo/rankme_last": 426.3678894042969, "geo/layer_0/stable_rank_q_proj": 20.41961669921875, "geo/layer_0/stable_rank_k_proj": 17.180795669555664, "geo/layer_0/stable_rank_o_proj": 45.579437255859375, "geo/layer_0/stable_rank_gate_proj": 130.5467071533203, "geo/layer_0/stable_rank_down_proj": 56.062862396240234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06467058509588242, "geo/layer_0/attn_entropy_mean": 6.260108470916748, "geo/layer_0/attn_entropy_std": 0.4233682453632355, "geo/layer_7/stable_rank_q_proj": 42.541595458984375, "geo/layer_7/stable_rank_k_proj": 39.65092849731445, "geo/layer_7/stable_rank_o_proj": 89.71906280517578, "geo/layer_7/stable_rank_gate_proj": 79.80142974853516, "geo/layer_7/stable_rank_down_proj": 143.3006134033203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43025773763656616, "geo/layer_7/attn_entropy_mean": 4.761535167694092, "geo/layer_7/attn_entropy_std": 0.7658824324607849, "geo/layer_14/stable_rank_q_proj": 51.40542984008789, "geo/layer_14/stable_rank_k_proj": 41.62202835083008, "geo/layer_14/stable_rank_o_proj": 42.790931701660156, "geo/layer_14/stable_rank_gate_proj": 71.53594970703125, "geo/layer_14/stable_rank_down_proj": 127.43087005615234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38739877939224243, "geo/layer_14/attn_entropy_mean": 5.520912170410156, "geo/layer_14/attn_entropy_std": 0.3997715413570404, "geo/layer_21/stable_rank_q_proj": 39.929691314697266, "geo/layer_21/stable_rank_k_proj": 29.464767456054688, "geo/layer_21/stable_rank_o_proj": 66.64501953125, "geo/layer_21/stable_rank_gate_proj": 63.12128829956055, "geo/layer_21/stable_rank_down_proj": 49.885379791259766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13937422633171082, "geo/layer_21/attn_entropy_mean": 5.862290382385254, "geo/layer_21/attn_entropy_std": 0.32065531611442566, "geo/layer_27/stable_rank_q_proj": 43.737789154052734, "geo/layer_27/stable_rank_k_proj": 30.96494483947754, "geo/layer_27/stable_rank_o_proj": 111.83308410644531, "geo/layer_27/stable_rank_gate_proj": 74.56383514404297, "geo/layer_27/stable_rank_down_proj": 127.51390838623047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11222134530544281, "geo/layer_27/attn_entropy_mean": 4.31483793258667, "geo/layer_27/attn_entropy_std": 0.6442780494689941, "attnres/final_alpha/block_0": 0.25117695331573486, "attnres/block_norm/0": 1.7789229154586792, "attnres/final_alpha/block_1": 0.004095025360584259, "attnres/block_norm/1": 49462.64453125, "attnres/final_alpha/block_2": 0.008885241113603115, "attnres/block_norm/2": 29605.87109375, "attnres/final_alpha/block_3": 0.010713658295571804, "attnres/block_norm/3": 67739.1875, "attnres/final_alpha/block_4": 0.012336600571870804, "attnres/block_norm/4": 16633.5859375, "attnres/final_alpha/block_5": 0.6087223887443542, "attnres/block_norm/5": 6912.408203125, "attnres/final_alpha/block_6": 0.10407011955976486, "attnres/block_norm/6": 45071.0390625, "geo/tier1_time_s": 1.3623056411743164, "geo/step": 3300.0, "geo/rankme_slope": 0.002114173912614597} {"step": 3310, "timestamp": 1778329295.4826958, "train/loss": 2.3946698904037476, "train/z_loss": 0.0013443997711874544, "train/perplexity": 10.964577949196899, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790181.2572340947, "perf/iters_per_sec": 0.8536249433680032, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1714746713638307, "data/tokens_consumed": 6943670272, "data/tokens_consumed_B": 6.943670272, "train/loss_slope": -1.813748271265735e-05} {"step": 3320, "timestamp": 1778329305.8305833, "train/loss": 2.3895776748657225, "train/z_loss": 0.0013504870468750596, "train/perplexity": 10.908885873367355, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027883.3366646122, "perf/iters_per_sec": 0.9669701274226247, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341581106185913, "data/tokens_consumed": 6964641792, "data/tokens_consumed_B": 6.964641792, "train/loss_slope": -1.6205896917778734e-05} {"step": 3330, "timestamp": 1778329316.1761773, "train/loss": 2.373670244216919, "train/z_loss": 0.0013511470519006253, "train/perplexity": 10.736726465148285, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028183.4793539285, "perf/iters_per_sec": 0.9671132466096537, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034005069732666, "data/tokens_consumed": 6985613312, "data/tokens_consumed_B": 6.985613312, "train/loss_slope": -1.7767473358740393e-05} {"step": 3340, "timestamp": 1778329326.549701, "train/loss": 2.3913965940475466, "train/z_loss": 0.0013559739803895355, "train/perplexity": 10.928746311925446, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022671.577397643, "perf/iters_per_sec": 0.9644849669445242, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03682279586792, "data/tokens_consumed": 7006584832, "data/tokens_consumed_B": 7.006584832, "train/loss_slope": -1.7375722734054795e-05} {"step": 3350, "timestamp": 1778329336.8923357, "grad/layer_0/attn": 0.0029384458903223276, "grad/layer_0/mlp": 0.003046783385798335, "grad/layer_0/attn_mlp_ratio": 0.9644419775868582, "grad/layer_4/attn": 0.0028064793441444635, "grad/layer_4/mlp": 0.0026921385433524847, "grad/layer_4/attn_mlp_ratio": 1.042472069956097, "grad/layer_8/attn": 0.0033199028111994267, "grad/layer_8/mlp": 0.0035140810068696737, "grad/layer_8/attn_mlp_ratio": 0.9447427962631119, "grad/layer_12/attn": 0.006495070643723011, "grad/layer_12/mlp": 0.007232590112835169, "grad/layer_12/attn_mlp_ratio": 0.8980282931274977, "grad/layer_16/attn": 0.0044313943944871426, "grad/layer_16/mlp": 0.005015057511627674, "grad/layer_16/attn_mlp_ratio": 0.8836178440328826, "grad/layer_20/attn": 0.0046118092723190784, "grad/layer_20/mlp": 0.007058307062834501, "grad/layer_20/attn_mlp_ratio": 0.6533874434655591, "grad/layer_24/attn": 0.018532603979110718, "grad/layer_24/mlp": 0.013724073767662048, "grad/layer_24/attn_mlp_ratio": 1.3503719200155997, "grad/layer_27/attn": 0.007024839986115694, "grad/layer_27/mlp": 0.013188734650611877, "grad/layer_27/attn_mlp_ratio": 0.5326394168166726} {"step": 3350, "timestamp": 1778329336.9084125, "train/loss": 2.3684523344039916, "train/z_loss": 0.0013588594854809343, "train/perplexity": 10.68084910306412, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025767.0472696319, "perf/iters_per_sec": 0.9659610020015869, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352384805679322, "data/tokens_consumed": 7027556352, "data/tokens_consumed_B": 7.027556352, "train/loss_slope": -1.9538078385360897e-05} {"step": 3360, "timestamp": 1778329347.7567294, "train/loss": 2.3706137418746946, "train/z_loss": 0.0013576679630205035, "train/perplexity": 10.703959736860076, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933850.5710840423, "perf/iters_per_sec": 0.9221318107052051, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0844436645507813, "data/tokens_consumed": 7048527872, "data/tokens_consumed_B": 7.048527872, "train/loss_slope": -1.819154407181908e-05} {"step": 3370, "timestamp": 1778329358.1162043, "train/loss": 2.399100661277771, "train/z_loss": 0.0013538723113015294, "train/perplexity": 11.013267267772866, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025684.333046802, "perf/iters_per_sec": 0.9659215607866297, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352807521820069, "data/tokens_consumed": 7069499392, "data/tokens_consumed_B": 7.069499392, "train/loss_slope": -1.9433278033156182e-05} {"step": 3375, "timestamp": 1778329363.8742032, "eos/sharpness": 55.34160137176512, "eos/L0_probe": 2.3753116130828857, "eos/L_plus": 2.6174156665802, "eos/L_minus": 2.6866235733032227, "eos/grad_norm": 0.19705024361610413, "eos/embed_grad_frac": 0.07305073738098145, "eos/time_s": 0.5931165218353271} {"step": 3375, "timestamp": 1778329365.2525826, "geo/rankme_last": 425.4456787109375, "geo/layer_0/stable_rank_q_proj": 20.431989669799805, "geo/layer_0/stable_rank_k_proj": 17.16912269592285, "geo/layer_0/stable_rank_o_proj": 45.54934310913086, "geo/layer_0/stable_rank_gate_proj": 130.36622619628906, "geo/layer_0/stable_rank_down_proj": 56.08363342285156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062202464789152145, "geo/layer_0/attn_entropy_mean": 6.257774353027344, "geo/layer_0/attn_entropy_std": 0.42322906851768494, "geo/layer_7/stable_rank_q_proj": 42.476348876953125, "geo/layer_7/stable_rank_k_proj": 39.64887237548828, "geo/layer_7/stable_rank_o_proj": 89.70384216308594, "geo/layer_7/stable_rank_gate_proj": 79.83644104003906, "geo/layer_7/stable_rank_down_proj": 143.10240173339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41659677028656006, "geo/layer_7/attn_entropy_mean": 4.744041442871094, "geo/layer_7/attn_entropy_std": 0.7643498778343201, "geo/layer_14/stable_rank_q_proj": 51.45557403564453, "geo/layer_14/stable_rank_k_proj": 41.57343673706055, "geo/layer_14/stable_rank_o_proj": 42.804595947265625, "geo/layer_14/stable_rank_gate_proj": 71.54641723632812, "geo/layer_14/stable_rank_down_proj": 127.25545501708984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36640816926956177, "geo/layer_14/attn_entropy_mean": 5.5428571701049805, "geo/layer_14/attn_entropy_std": 0.42078179121017456, "geo/layer_21/stable_rank_q_proj": 39.88718032836914, "geo/layer_21/stable_rank_k_proj": 29.435449600219727, "geo/layer_21/stable_rank_o_proj": 66.5953140258789, "geo/layer_21/stable_rank_gate_proj": 63.093605041503906, "geo/layer_21/stable_rank_down_proj": 49.843509674072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13270168006420135, "geo/layer_21/attn_entropy_mean": 5.8692193031311035, "geo/layer_21/attn_entropy_std": 0.31106775999069214, "geo/layer_27/stable_rank_q_proj": 43.78645706176758, "geo/layer_27/stable_rank_k_proj": 30.986427307128906, "geo/layer_27/stable_rank_o_proj": 111.46683502197266, "geo/layer_27/stable_rank_gate_proj": 74.55143737792969, "geo/layer_27/stable_rank_down_proj": 127.48213958740234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09388621151447296, "geo/layer_27/attn_entropy_mean": 4.331840991973877, "geo/layer_27/attn_entropy_std": 0.6435710787773132, "attnres/final_alpha/block_0": 0.2501496374607086, "attnres/block_norm/0": 1.7790601253509521, "attnres/final_alpha/block_1": 0.004075564444065094, "attnres/block_norm/1": 49377.546875, "attnres/final_alpha/block_2": 0.008820777758955956, "attnres/block_norm/2": 29516.078125, "attnres/final_alpha/block_3": 0.010752718895673752, "attnres/block_norm/3": 67565.1796875, "attnres/final_alpha/block_4": 0.01246571820229292, "attnres/block_norm/4": 16624.884765625, "attnres/final_alpha/block_5": 0.6088399291038513, "attnres/block_norm/5": 6956.12109375, "attnres/final_alpha/block_6": 0.10489565134048462, "attnres/block_norm/6": 45352.3359375, "geo/tier1_time_s": 1.359527587890625, "geo/step": 3375.0, "geo/rankme_slope": 0.0020878026952281838} {"step": 3380, "timestamp": 1778329370.4378035, "train/loss": 2.4000057935714723, "train/z_loss": 0.001355225360020995, "train/perplexity": 11.023240244386814, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702643.4076618878, "perf/iters_per_sec": 0.8118836439427795, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2317035913467407, "data/tokens_consumed": 7090470912, "data/tokens_consumed_B": 7.090470912, "train/loss_slope": -1.8947158688151287e-05} {"step": 3390, "timestamp": 1778329380.7924426, "train/loss": 2.3673153638839723, "train/z_loss": 0.0013489436125382781, "train/perplexity": 10.66871219346682, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026311.0393732307, "perf/iters_per_sec": 0.9662203976503518, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349605560302735, "data/tokens_consumed": 7111442432, "data/tokens_consumed_B": 7.111442432, "train/loss_slope": -1.8734620864277647e-05} {"step": 3400, "timestamp": 1778329391.1358552, "grad/layer_0/attn": 0.0034956096205860376, "grad/layer_0/mlp": 0.0034407551866024733, "grad/layer_0/attn_mlp_ratio": 1.0159425269786417, "grad/layer_4/attn": 0.0025604746770113707, "grad/layer_4/mlp": 0.002642095321789384, "grad/layer_4/attn_mlp_ratio": 0.9691075711706374, "grad/layer_8/attn": 0.004509367514401674, "grad/layer_8/mlp": 0.0034271932672709227, "grad/layer_8/attn_mlp_ratio": 1.31576104151733, "grad/layer_12/attn": 0.006276906467974186, "grad/layer_12/mlp": 0.007027823943644762, "grad/layer_12/attn_mlp_ratio": 0.8931507717029956, "grad/layer_16/attn": 0.004320966079831123, "grad/layer_16/mlp": 0.004448127932846546, "grad/layer_16/attn_mlp_ratio": 0.9714122543064377, "grad/layer_20/attn": 0.0033445176668465137, "grad/layer_20/mlp": 0.005348182283341885, "grad/layer_20/attn_mlp_ratio": 0.6253559484552669, "grad/layer_24/attn": 0.004869627766311169, "grad/layer_24/mlp": 0.00770649965852499, "grad/layer_24/attn_mlp_ratio": 0.6318858001551676, "grad/layer_27/attn": 0.004034614190459251, "grad/layer_27/mlp": 0.006690471898764372, "grad/layer_27/attn_mlp_ratio": 0.6030387977416815} {"step": 3400, "timestamp": 1778329391.1515002, "train/loss": 2.3904166221618652, "train/z_loss": 0.0013446970144286752, "train/perplexity": 10.918041693763115, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025658.0693117315, "perf/iters_per_sec": 0.9659090372618349, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352941751480103, "data/tokens_consumed": 7132413952, "data/tokens_consumed_B": 7.132413952, "train/loss_slope": -2.0807142297748813e-05} {"step": 3410, "timestamp": 1778329401.5013103, "train/loss": 2.403578996658325, "train/z_loss": 0.001348866696935147, "train/perplexity": 11.062698975501732, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027162.0297226838, "perf/iters_per_sec": 0.9666261814702434, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345260858535767, "data/tokens_consumed": 7153385472, "data/tokens_consumed_B": 7.153385472, "train/loss_slope": -2.0719751938305755e-05} {"step": 3420, "timestamp": 1778329411.8538642, "train/loss": 2.3968960523605345, "train/z_loss": 0.0013484748895280063, "train/perplexity": 10.989014064783369, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026933.1358579968, "perf/iters_per_sec": 0.9665170363702759, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346429109573365, "data/tokens_consumed": 7174356992, "data/tokens_consumed_B": 7.174356992, "train/loss_slope": -1.9236939982755644e-05} {"step": 3430, "timestamp": 1778329422.2146416, "train/loss": 2.3791687965393065, "train/z_loss": 0.0013449776568450033, "train/perplexity": 10.795925522793683, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025556.0530574012, "perf/iters_per_sec": 0.9658603921210295, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353463172912598, "data/tokens_consumed": 7195328512, "data/tokens_consumed_B": 7.195328512, "train/loss_slope": -1.848580950986796e-05} {"step": 3440, "timestamp": 1778329432.5747242, "train/loss": 2.3780689239501953, "train/z_loss": 0.0013472784543409944, "train/perplexity": 10.784057907865629, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025178.7708363158, "perf/iters_per_sec": 0.9656804899388866, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035539197921753, "data/tokens_consumed": 7216300032, "data/tokens_consumed_B": 7.216300032, "train/loss_slope": -1.582832714118583e-05} {"step": 3450, "timestamp": 1778329442.9167533, "grad/layer_0/attn": 0.002955891890451312, "grad/layer_0/mlp": 0.0033203186467289925, "grad/layer_0/attn_mlp_ratio": 0.8902434121312314, "grad/layer_4/attn": 0.0021174424327909946, "grad/layer_4/mlp": 0.0025829493533819914, "grad/layer_4/attn_mlp_ratio": 0.81977695305594, "grad/layer_8/attn": 0.004184391349554062, "grad/layer_8/mlp": 0.0034361735451966524, "grad/layer_8/attn_mlp_ratio": 1.2177473497019935, "grad/layer_12/attn": 0.005681444425135851, "grad/layer_12/mlp": 0.006599901244044304, "grad/layer_12/attn_mlp_ratio": 0.8608377805923931, "grad/layer_16/attn": 0.005229727830737829, "grad/layer_16/mlp": 0.004844465758651495, "grad/layer_16/attn_mlp_ratio": 1.0795262023362833, "grad/layer_20/attn": 0.0035920387599617243, "grad/layer_20/mlp": 0.00739218108355999, "grad/layer_20/attn_mlp_ratio": 0.4859240690623657, "grad/layer_24/attn": 0.014247752726078033, "grad/layer_24/mlp": 0.012910846620798111, "grad/layer_24/attn_mlp_ratio": 1.1035490571758007, "grad/layer_27/attn": 0.007329428102821112, "grad/layer_27/mlp": 0.014070011675357819, "grad/layer_27/attn_mlp_ratio": 0.5209255130587632} {"step": 3450, "timestamp": 1778329443.5150723, "eos/sharpness": 46.94170951843261, "eos/L0_probe": 2.3782835006713867, "eos/L_plus": 2.608919858932495, "eos/L_minus": 2.6170642375946045, "eos/grad_norm": 0.19492322206497192, "eos/embed_grad_frac": 0.07635767757892609, "eos/time_s": 0.5953412055969238} {"step": 3450, "timestamp": 1778329443.5368855, "train/loss": 2.4196813344955443, "train/z_loss": 0.0013562796404585243, "train/perplexity": 11.242276218385179, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913999.7443692805, "perf/iters_per_sec": 0.9126661989065554, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0956908464431763, "data/tokens_consumed": 7237271552, "data/tokens_consumed_B": 7.237271552, "train/loss_slope": -1.5397357697462597e-05} {"step": 3450, "timestamp": 1778329444.9004533, "geo/rankme_last": 426.7439880371094, "geo/layer_0/stable_rank_q_proj": 20.430185317993164, "geo/layer_0/stable_rank_k_proj": 17.158958435058594, "geo/layer_0/stable_rank_o_proj": 45.567867279052734, "geo/layer_0/stable_rank_gate_proj": 130.3190155029297, "geo/layer_0/stable_rank_down_proj": 56.11412811279297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06349964439868927, "geo/layer_0/attn_entropy_mean": 6.253997325897217, "geo/layer_0/attn_entropy_std": 0.42405277490615845, "geo/layer_7/stable_rank_q_proj": 42.42361831665039, "geo/layer_7/stable_rank_k_proj": 39.63812255859375, "geo/layer_7/stable_rank_o_proj": 89.74059295654297, "geo/layer_7/stable_rank_gate_proj": 79.79708862304688, "geo/layer_7/stable_rank_down_proj": 143.20907592773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.424993634223938, "geo/layer_7/attn_entropy_mean": 4.711382865905762, "geo/layer_7/attn_entropy_std": 0.782036542892456, "geo/layer_14/stable_rank_q_proj": 51.50210189819336, "geo/layer_14/stable_rank_k_proj": 41.55223083496094, "geo/layer_14/stable_rank_o_proj": 42.745445251464844, "geo/layer_14/stable_rank_gate_proj": 71.55049896240234, "geo/layer_14/stable_rank_down_proj": 127.56291961669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38879096508026123, "geo/layer_14/attn_entropy_mean": 5.558716773986816, "geo/layer_14/attn_entropy_std": 0.44562026858329773, "geo/layer_21/stable_rank_q_proj": 39.86897659301758, "geo/layer_21/stable_rank_k_proj": 29.417530059814453, "geo/layer_21/stable_rank_o_proj": 66.54815673828125, "geo/layer_21/stable_rank_gate_proj": 63.0722770690918, "geo/layer_21/stable_rank_down_proj": 49.7952995300293, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1403040736913681, "geo/layer_21/attn_entropy_mean": 5.840000152587891, "geo/layer_21/attn_entropy_std": 0.317259818315506, "geo/layer_27/stable_rank_q_proj": 43.81232833862305, "geo/layer_27/stable_rank_k_proj": 31.009408950805664, "geo/layer_27/stable_rank_o_proj": 111.22755432128906, "geo/layer_27/stable_rank_gate_proj": 74.5291976928711, "geo/layer_27/stable_rank_down_proj": 127.41189575195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09840401262044907, "geo/layer_27/attn_entropy_mean": 4.310041427612305, "geo/layer_27/attn_entropy_std": 0.6557700634002686, "attnres/final_alpha/block_0": 0.2515295743942261, "attnres/block_norm/0": 1.7790087461471558, "attnres/final_alpha/block_1": 0.004056092817336321, "attnres/block_norm/1": 49651.63671875, "attnres/final_alpha/block_2": 0.00880216620862484, "attnres/block_norm/2": 29665.123046875, "attnres/final_alpha/block_3": 0.010721813887357712, "attnres/block_norm/3": 67929.1875, "attnres/final_alpha/block_4": 0.012545639649033546, "attnres/block_norm/4": 16550.345703125, "attnres/final_alpha/block_5": 0.6080706119537354, "attnres/block_norm/5": 7018.052734375, "attnres/final_alpha/block_6": 0.10427407920360565, "attnres/block_norm/6": 45108.4921875, "geo/tier1_time_s": 1.3592743873596191, "geo/step": 3450.0, "geo/rankme_slope": 0.002105144414863504} {"step": 3460, "timestamp": 1778329455.259829, "train/loss": 2.3783827304840086, "train/z_loss": 0.0013684516190551222, "train/perplexity": 10.787442546731258, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789552.7038771014, "perf/iters_per_sec": 0.8533252257714755, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171886134147644, "data/tokens_consumed": 7258243072, "data/tokens_consumed_B": 7.258243072, "train/loss_slope": -1.5192744497991108e-05} {"step": 3470, "timestamp": 1778329465.6092272, "train/loss": 2.3814084053039553, "train/z_loss": 0.0013558664708398283, "train/perplexity": 10.820131267798667, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027332.6593471575, "perf/iters_per_sec": 0.9667075440154826, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344390153884888, "data/tokens_consumed": 7279214592, "data/tokens_consumed_B": 7.279214592, "train/loss_slope": -1.7291606255848103e-05} {"step": 3480, "timestamp": 1778329475.9624527, "train/loss": 2.3893330335617065, "train/z_loss": 0.0013488356256857515, "train/perplexity": 10.906217435720274, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026885.5884746367, "perf/iters_per_sec": 0.9664943640111144, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034667181968689, "data/tokens_consumed": 7300186112, "data/tokens_consumed_B": 7.300186112, "train/loss_slope": -1.8921867877629484e-05} {"step": 3490, "timestamp": 1778329486.3150005, "train/loss": 2.3754441022872923, "train/z_loss": 0.0013544354354962707, "train/perplexity": 10.755788795974151, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027031.0400497322, "perf/iters_per_sec": 0.9665637207268392, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345929384231567, "data/tokens_consumed": 7321157632, "data/tokens_consumed_B": 7.321157632, "train/loss_slope": -1.763374032181667e-05} {"step": 3500, "timestamp": 1778329496.662699, "grad/layer_0/attn": 0.002984000137075782, "grad/layer_0/mlp": 0.0033236381132155657, "grad/layer_0/attn_mlp_ratio": 0.8978113578098533, "grad/layer_4/attn": 0.0021683438681066036, "grad/layer_4/mlp": 0.002740823430940509, "grad/layer_4/attn_mlp_ratio": 0.7911285946098617, "grad/layer_8/attn": 0.004938244819641113, "grad/layer_8/mlp": 0.003665633499622345, "grad/layer_8/attn_mlp_ratio": 1.3471736018978766, "grad/layer_12/attn": 0.008593021892011166, "grad/layer_12/mlp": 0.007620664779096842, "grad/layer_12/attn_mlp_ratio": 1.1275947739916314, "grad/layer_16/attn": 0.0037936086300760508, "grad/layer_16/mlp": 0.004599902778863907, "grad/layer_16/attn_mlp_ratio": 0.8247149407234885, "grad/layer_20/attn": 0.004151877481490374, "grad/layer_20/mlp": 0.0062517570331692696, "grad/layer_20/attn_mlp_ratio": 0.664113687248375, "grad/layer_24/attn": 0.007895763032138348, "grad/layer_24/mlp": 0.00862873811274767, "grad/layer_24/attn_mlp_ratio": 0.9150541872360364, "grad/layer_27/attn": 0.005815746728330851, "grad/layer_27/mlp": 0.008443372324109077, "grad/layer_27/attn_mlp_ratio": 0.6887942916890242} {"step": 3500, "timestamp": 1778329496.678415, "train/loss": 2.346107244491577, "train/z_loss": 0.0013543343753553926, "train/perplexity": 10.444831309212661, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024544.796344939, "perf/iters_per_sec": 0.9653781873440451, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035863471031189, "data/tokens_consumed": 7342129152, "data/tokens_consumed_B": 7.342129152, "train/loss_slope": -2.112226403228098e-05} {"step": 3500, "timestamp": 1778329503.7538514, "geo/ww_alpha_mean": 7.532442803688726, "geo/ww_alpha_std": 4.529412205967561, "geo/ww_alpha_min": 1.3504472204050073, "geo/ww_alpha_max": 31.26825685919383, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.9060667335473456, "geo/ww_alpha_by_type/k_proj": 4.316138367951234, "geo/ww_alpha_by_type/v_proj": 8.45884922079703, "geo/ww_alpha_by_type/o_proj": 8.814156318535144, "geo/ww_alpha_by_type/gate_proj": 8.06392371849471, "geo/ww_alpha_by_type/up_proj": 11.321919757701224, "geo/ww_alpha_by_type/down_proj": 7.9364406841762305, "geo/twonn_id/layer_0": 0.783310055732727, "geo/twonn_id/layer_7": 3.634734630584717, "geo/twonn_id/layer_14": 5.849486351013184, "geo/twonn_id/layer_21": 9.056777954101562, "geo/twonn_id/layer_27": 5.705541610717773, "geo/tier2_time_s": 7.069154739379883} {"step": 3500, "timestamp": 1778329504.511132, "eoc/jacobian_sigma/layer_0/attn": 1393.3792724609375, "eoc/jacobian_sigma/layer_0/mlp": 11454.16015625, "eoc/jacobian_sigma/layer_0": 11454.16015625, "eoc/jacobian_sigma/layer_7/attn": 1.0887439250946045, "eoc/jacobian_sigma/layer_7/mlp": 1.8293895721435547, "eoc/jacobian_sigma/layer_7": 1.8293895721435547, "eoc/jacobian_sigma/layer_14/attn": 1.9546831846237183, "eoc/jacobian_sigma/layer_14/mlp": 12.943964958190918, "eoc/jacobian_sigma/layer_14": 12.943964958190918, "eoc/jacobian_sigma/layer_21/attn": 1.0510971546173096, "eoc/jacobian_sigma/layer_21/mlp": 4.549111366271973, "eoc/jacobian_sigma/layer_21": 4.549111366271973, "eoc/jacobian_sigma/layer_27/attn": 3.4212911128997803, "eoc/jacobian_sigma/layer_27/mlp": 29.15262222290039, "eoc/jacobian_sigma/layer_27": 29.15262222290039, "eoc/layer0_sigma": 11454.16015625, "eoc/sigma_max": 29.15262222290039, "eoc/sigma_min": 1.8293895721435547, "eoc/sigma_mean": 12.118772029876709, "eoc/time_s": 0.7493999004364014} {"step": 3510, "timestamp": 1778329514.8841949, "train/loss": 2.3678404808044435, "train/z_loss": 0.0013612203416414558, "train/perplexity": 10.674315985953603, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1152232.8030330073, "perf/iters_per_sec": 0.5494274153866803, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8200766324996949, "data/tokens_consumed": 7363100672, "data/tokens_consumed_B": 7.363100672, "train/loss_slope": -2.246324690547826e-05} {"step": 3520, "timestamp": 1778329525.24279, "train/loss": 2.3668689489364625, "train/z_loss": 0.0013499719323590397, "train/perplexity": 10.663950583778922, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025477.8802027705, "perf/iters_per_sec": 0.9658231163991787, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353862762451171, "data/tokens_consumed": 7384072192, "data/tokens_consumed_B": 7.384072192, "train/loss_slope": -2.206087727608213e-05} {"step": 3525, "timestamp": 1778329531.016519, "eos/sharpness": 61.68851852416991, "eos/L0_probe": 2.3704545497894287, "eos/L_plus": 2.740485191345215, "eos/L_minus": 2.617309093475342, "eos/grad_norm": 0.22373844683170319, "eos/embed_grad_frac": 0.050881046801805496, "eos/time_s": 0.6056656837463379} {"step": 3525, "timestamp": 1778329532.3970227, "geo/rankme_last": 425.8962707519531, "geo/layer_0/stable_rank_q_proj": 20.475149154663086, "geo/layer_0/stable_rank_k_proj": 17.16702651977539, "geo/layer_0/stable_rank_o_proj": 45.53195571899414, "geo/layer_0/stable_rank_gate_proj": 130.39488220214844, "geo/layer_0/stable_rank_down_proj": 56.12006378173828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07034707814455032, "geo/layer_0/attn_entropy_mean": 6.2605366706848145, "geo/layer_0/attn_entropy_std": 0.42217326164245605, "geo/layer_7/stable_rank_q_proj": 42.459938049316406, "geo/layer_7/stable_rank_k_proj": 39.733463287353516, "geo/layer_7/stable_rank_o_proj": 89.62835693359375, "geo/layer_7/stable_rank_gate_proj": 79.7472152709961, "geo/layer_7/stable_rank_down_proj": 143.0758514404297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41925248503685, "geo/layer_7/attn_entropy_mean": 4.723761558532715, "geo/layer_7/attn_entropy_std": 0.7984281778335571, "geo/layer_14/stable_rank_q_proj": 51.59831237792969, "geo/layer_14/stable_rank_k_proj": 41.50319290161133, "geo/layer_14/stable_rank_o_proj": 42.71689987182617, "geo/layer_14/stable_rank_gate_proj": 71.56422424316406, "geo/layer_14/stable_rank_down_proj": 127.80422973632812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37559986114501953, "geo/layer_14/attn_entropy_mean": 5.540989875793457, "geo/layer_14/attn_entropy_std": 0.4306720793247223, "geo/layer_21/stable_rank_q_proj": 39.87408447265625, "geo/layer_21/stable_rank_k_proj": 29.382904052734375, "geo/layer_21/stable_rank_o_proj": 66.55960845947266, "geo/layer_21/stable_rank_gate_proj": 63.14601516723633, "geo/layer_21/stable_rank_down_proj": 49.84476852416992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13607050478458405, "geo/layer_21/attn_entropy_mean": 5.867516040802002, "geo/layer_21/attn_entropy_std": 0.3219549357891083, "geo/layer_27/stable_rank_q_proj": 43.8316650390625, "geo/layer_27/stable_rank_k_proj": 31.007556915283203, "geo/layer_27/stable_rank_o_proj": 111.21615600585938, "geo/layer_27/stable_rank_gate_proj": 74.35102081298828, "geo/layer_27/stable_rank_down_proj": 127.31330871582031, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1082329973578453, "geo/layer_27/attn_entropy_mean": 4.304093360900879, "geo/layer_27/attn_entropy_std": 0.6650873422622681, "attnres/final_alpha/block_0": 0.2482709288597107, "attnres/block_norm/0": 1.7788183689117432, "attnres/final_alpha/block_1": 0.0039561158046126366, "attnres/block_norm/1": 49637.72265625, "attnres/final_alpha/block_2": 0.008783379569649696, "attnres/block_norm/2": 29817.041015625, "attnres/final_alpha/block_3": 0.010536272078752518, "attnres/block_norm/3": 68023.046875, "attnres/final_alpha/block_4": 0.012134039774537086, "attnres/block_norm/4": 16670.630859375, "attnres/final_alpha/block_5": 0.6154979467391968, "attnres/block_norm/5": 6868.4111328125, "attnres/final_alpha/block_6": 0.10082133114337921, "attnres/block_norm/6": 45469.453125, "geo/tier1_time_s": 1.359999179840088, "geo/step": 3525.0, "geo/rankme_slope": 0.0020862027100775485} {"step": 3530, "timestamp": 1778329537.5759852, "train/loss": 2.36467387676239, "train/z_loss": 0.0013524416601285339, "train/perplexity": 10.640568115078715, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701348.4944023886, "perf/iters_per_sec": 0.8112661811840003, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.232641053199768, "data/tokens_consumed": 7405043712, "data/tokens_consumed_B": 7.405043712, "train/loss_slope": -2.1096978028758427e-05} {"step": 3540, "timestamp": 1778329547.9250565, "train/loss": 2.3988656044006347, "train/z_loss": 0.0013452377868816256, "train/perplexity": 11.010678827789063, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027263.9741987984, "perf/iters_per_sec": 0.9666747923845284, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344740629196167, "data/tokens_consumed": 7426015232, "data/tokens_consumed_B": 7.426015232, "train/loss_slope": -2.0289121430949594e-05} {"step": 3550, "timestamp": 1778329558.275761, "grad/layer_0/attn": 0.0027870323974639177, "grad/layer_0/mlp": 0.0031738600227981806, "grad/layer_0/attn_mlp_ratio": 0.8781207392992408, "grad/layer_4/attn": 0.0022992424201220274, "grad/layer_4/mlp": 0.00264916499145329, "grad/layer_4/attn_mlp_ratio": 0.8679120933383202, "grad/layer_8/attn": 0.004169507883489132, "grad/layer_8/mlp": 0.003370456164702773, "grad/layer_8/attn_mlp_ratio": 1.2370752076371547, "grad/layer_12/attn": 0.005944545846432447, "grad/layer_12/mlp": 0.006838921457529068, "grad/layer_12/attn_mlp_ratio": 0.8692226978225844, "grad/layer_16/attn": 0.003995060455054045, "grad/layer_16/mlp": 0.004764292389154434, "grad/layer_16/attn_mlp_ratio": 0.8385422314327909, "grad/layer_20/attn": 0.003913519438356161, "grad/layer_20/mlp": 0.006549457553774118, "grad/layer_20/attn_mlp_ratio": 0.5975333600486751, "grad/layer_24/attn": 0.018755575641989708, "grad/layer_24/mlp": 0.014408625662326813, "grad/layer_24/attn_mlp_ratio": 1.3016908032290317, "grad/layer_27/attn": 0.007783319801092148, "grad/layer_27/mlp": 0.014680951833724976, "grad/layer_27/attn_mlp_ratio": 0.5301645176844673} {"step": 3550, "timestamp": 1778329558.291445, "train/loss": 2.410199284553528, "train/z_loss": 0.0013320865924470126, "train/perplexity": 11.136180192644705, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023979.3973704192, "perf/iters_per_sec": 0.9651085841037842, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361528396606445, "data/tokens_consumed": 7446986752, "data/tokens_consumed_B": 7.446986752, "train/loss_slope": -1.8269446201593453e-05} {"step": 3560, "timestamp": 1778329568.6450472, "train/loss": 2.4180423259735107, "train/z_loss": 0.0013411962310783565, "train/perplexity": 11.223865123948082, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026728.6232178987, "perf/iters_per_sec": 0.9664195171441549, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034747314453125, "data/tokens_consumed": 7467958272, "data/tokens_consumed_B": 7.467958272, "train/loss_slope": -1.5251798197703099e-05} {"step": 3570, "timestamp": 1778329578.9972546, "train/loss": 2.4022201776504515, "train/z_loss": 0.0013615622418001294, "train/perplexity": 11.047676978254502, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026767.009975616, "perf/iters_per_sec": 0.9664378213766174, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347277164459228, "data/tokens_consumed": 7488929792, "data/tokens_consumed_B": 7.488929792, "train/loss_slope": -1.6712209551033256e-05} {"step": 3580, "timestamp": 1778329589.3544269, "train/loss": 2.324777889251709, "train/z_loss": 0.0013630894012749196, "train/perplexity": 10.224408882463282, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025819.9543246457, "perf/iters_per_sec": 0.9659862300513485, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035211443901062, "data/tokens_consumed": 7509901312, "data/tokens_consumed_B": 7.509901312, "train/loss_slope": -2.0139098646688733e-05} {"step": 3590, "timestamp": 1778329599.7155313, "train/loss": 2.374605894088745, "train/z_loss": 0.001333965640515089, "train/perplexity": 10.746776983038902, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025528.8131438573, "perf/iters_per_sec": 0.9658474031180655, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353602409362792, "data/tokens_consumed": 7530872832, "data/tokens_consumed_B": 7.530872832, "train/loss_slope": -1.8973874430594935e-05} {"step": 3600, "timestamp": 1778329610.0549698, "grad/layer_0/attn": 0.003025206271559, "grad/layer_0/mlp": 0.003320187795907259, "grad/layer_0/attn_mlp_ratio": 0.9111551413364654, "grad/layer_4/attn": 0.002011209726333618, "grad/layer_4/mlp": 0.00260924338363111, "grad/layer_4/attn_mlp_ratio": 0.7708018584508456, "grad/layer_8/attn": 0.005181341432034969, "grad/layer_8/mlp": 0.0034750390332192183, "grad/layer_8/attn_mlp_ratio": 1.491016714748493, "grad/layer_12/attn": 0.008637672290205956, "grad/layer_12/mlp": 0.006722678430378437, "grad/layer_12/attn_mlp_ratio": 1.2848557686008704, "grad/layer_16/attn": 0.0037501163315027952, "grad/layer_16/mlp": 0.004731182008981705, "grad/layer_16/attn_mlp_ratio": 0.7926383396622066, "grad/layer_20/attn": 0.004397139884531498, "grad/layer_20/mlp": 0.008030885830521584, "grad/layer_20/attn_mlp_ratio": 0.5475286192050192, "grad/layer_24/attn": 0.012148912064731121, "grad/layer_24/mlp": 0.011739120818674564, "grad/layer_24/attn_mlp_ratio": 1.034908162961731, "grad/layer_27/attn": 0.006233790889382362, "grad/layer_27/mlp": 0.01074913702905178, "grad/layer_27/attn_mlp_ratio": 0.579934074199709} {"step": 3600, "timestamp": 1778329610.6535714, "eos/sharpness": 57.621860504150376, "eos/L0_probe": 2.3699188232421875, "eos/L_plus": 2.70871901512146, "eos/L_minus": 2.607337236404419, "eos/grad_norm": 0.1816774159669876, "eos/embed_grad_frac": 0.08400005102157593, "eos/time_s": 0.5958666801452637} {"step": 3600, "timestamp": 1778329610.6730192, "train/loss": 2.410040497779846, "train/z_loss": 0.0013394916895776987, "train/perplexity": 11.134412054902935, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914759.7051414433, "perf/iters_per_sec": 0.9130285764414994, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.095255970954895, "data/tokens_consumed": 7551844352, "data/tokens_consumed_B": 7.551844352, "train/loss_slope": -1.7955221086874628e-05} {"step": 3600, "timestamp": 1778329612.039327, "geo/rankme_last": 426.7889709472656, "geo/layer_0/stable_rank_q_proj": 20.5137996673584, "geo/layer_0/stable_rank_k_proj": 17.15782356262207, "geo/layer_0/stable_rank_o_proj": 45.48188018798828, "geo/layer_0/stable_rank_gate_proj": 130.0298309326172, "geo/layer_0/stable_rank_down_proj": 56.12325668334961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06340811401605606, "geo/layer_0/attn_entropy_mean": 6.257839679718018, "geo/layer_0/attn_entropy_std": 0.42106062173843384, "geo/layer_7/stable_rank_q_proj": 42.43620681762695, "geo/layer_7/stable_rank_k_proj": 39.67793655395508, "geo/layer_7/stable_rank_o_proj": 89.5751724243164, "geo/layer_7/stable_rank_gate_proj": 79.6978988647461, "geo/layer_7/stable_rank_down_proj": 143.25978088378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40098506212234497, "geo/layer_7/attn_entropy_mean": 4.713539123535156, "geo/layer_7/attn_entropy_std": 0.7738149166107178, "geo/layer_14/stable_rank_q_proj": 51.628780364990234, "geo/layer_14/stable_rank_k_proj": 41.53239822387695, "geo/layer_14/stable_rank_o_proj": 42.755313873291016, "geo/layer_14/stable_rank_gate_proj": 71.48704528808594, "geo/layer_14/stable_rank_down_proj": 127.79576110839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37749767303466797, "geo/layer_14/attn_entropy_mean": 5.48763370513916, "geo/layer_14/attn_entropy_std": 0.4152344763278961, "geo/layer_21/stable_rank_q_proj": 39.795623779296875, "geo/layer_21/stable_rank_k_proj": 29.364269256591797, "geo/layer_21/stable_rank_o_proj": 66.52084350585938, "geo/layer_21/stable_rank_gate_proj": 63.12464904785156, "geo/layer_21/stable_rank_down_proj": 49.765628814697266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13646291196346283, "geo/layer_21/attn_entropy_mean": 5.88576078414917, "geo/layer_21/attn_entropy_std": 0.3028813600540161, "geo/layer_27/stable_rank_q_proj": 43.84143829345703, "geo/layer_27/stable_rank_k_proj": 30.901216506958008, "geo/layer_27/stable_rank_o_proj": 110.98971557617188, "geo/layer_27/stable_rank_gate_proj": 74.3517837524414, "geo/layer_27/stable_rank_down_proj": 127.26952362060547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1077549085021019, "geo/layer_27/attn_entropy_mean": 4.33724308013916, "geo/layer_27/attn_entropy_std": 0.6485829949378967, "attnres/final_alpha/block_0": 0.24984630942344666, "attnres/block_norm/0": 1.7785570621490479, "attnres/final_alpha/block_1": 0.004085024353116751, "attnres/block_norm/1": 49464.26171875, "attnres/final_alpha/block_2": 0.00892601814121008, "attnres/block_norm/2": 29595.890625, "attnres/final_alpha/block_3": 0.01053575985133648, "attnres/block_norm/3": 68113.015625, "attnres/final_alpha/block_4": 0.012350216507911682, "attnres/block_norm/4": 16597.09375, "attnres/final_alpha/block_5": 0.6130492687225342, "attnres/block_norm/5": 6900.09423828125, "attnres/final_alpha/block_6": 0.10120736062526703, "attnres/block_norm/6": 45328.1953125, "geo/tier1_time_s": 1.3618481159210205, "geo/step": 3600.0, "geo/rankme_slope": 0.0020935279482886906} {"step": 3610, "timestamp": 1778329622.3912883, "train/loss": 2.329946780204773, "train/z_loss": 0.0013512298581190407, "train/perplexity": 10.277394557654791, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790306.1973753038, "perf/iters_per_sec": 0.8536845194698829, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1713929176330566, "data/tokens_consumed": 7572815872, "data/tokens_consumed_B": 7.572815872, "train/loss_slope": -1.8610420221328166e-05} {"step": 3620, "timestamp": 1778329632.77978, "train/loss": 2.400493335723877, "train/z_loss": 0.0013353240792639554, "train/perplexity": 11.028615848972473, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019964.0475481097, "perf/iters_per_sec": 0.9631939161053227, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038212537765503, "data/tokens_consumed": 7593787392, "data/tokens_consumed_B": 7.593787392, "train/loss_slope": -1.6863334912135006e-05} {"step": 3630, "timestamp": 1778329643.1564384, "train/loss": 2.4441339492797853, "train/z_loss": 0.001335613103583455, "train/perplexity": 11.520567881015154, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022032.3850045034, "perf/iters_per_sec": 0.9641801762602346, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371505498886109, "data/tokens_consumed": 7614758912, "data/tokens_consumed_B": 7.614758912, "train/loss_slope": -1.077036145139018e-05} {"step": 3640, "timestamp": 1778329653.5387657, "train/loss": 2.4215942859649657, "train/z_loss": 0.0013399129849858582, "train/perplexity": 11.263802730218522, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020930.1501940046, "perf/iters_per_sec": 0.9636545897455238, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377162218093872, "data/tokens_consumed": 7635730432, "data/tokens_consumed_B": 7.635730432, "train/loss_slope": -5.659083006727475e-06} {"step": 3650, "timestamp": 1778329663.8989704, "grad/layer_0/attn": 0.003061440773308277, "grad/layer_0/mlp": 0.0034797065891325474, "grad/layer_0/attn_mlp_ratio": 0.8797985137280236, "grad/layer_4/attn": 0.002032880438491702, "grad/layer_4/mlp": 0.00266548921354115, "grad/layer_4/attn_mlp_ratio": 0.7626668875257969, "grad/layer_8/attn": 0.005171424243599176, "grad/layer_8/mlp": 0.0035865805111825466, "grad/layer_8/attn_mlp_ratio": 1.4418815033670982, "grad/layer_12/attn": 0.004429864697158337, "grad/layer_12/mlp": 0.0069122533313930035, "grad/layer_12/attn_mlp_ratio": 0.6408712789724206, "grad/layer_16/attn": 0.003714179154485464, "grad/layer_16/mlp": 0.004194105044007301, "grad/layer_16/attn_mlp_ratio": 0.8855713023295151, "grad/layer_20/attn": 0.004523284267634153, "grad/layer_20/mlp": 0.005571294110268354, "grad/layer_20/attn_mlp_ratio": 0.8118911148683133, "grad/layer_24/attn": 0.009638141840696335, "grad/layer_24/mlp": 0.009296685457229614, "grad/layer_24/attn_mlp_ratio": 1.0367288192511992, "grad/layer_27/attn": 0.007567991968244314, "grad/layer_27/mlp": 0.008889223448932171, "grad/layer_27/attn_mlp_ratio": 0.8513670430926927} {"step": 3650, "timestamp": 1778329663.9144547, "train/loss": 2.387349271774292, "train/z_loss": 0.0013670161599293352, "train/perplexity": 10.884603543820585, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022669.019264937, "perf/iters_per_sec": 0.9644837471317944, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036824107170105, "data/tokens_consumed": 7656701952, "data/tokens_consumed_B": 7.656701952, "train/loss_slope": -5.231751617353793e-06} {"step": 3660, "timestamp": 1778329674.2648852, "train/loss": 2.3719526052474977, "train/z_loss": 0.0013518880703486502, "train/perplexity": 10.718300474497685, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027171.2332712896, "perf/iters_per_sec": 0.9666305700642059, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345213890075684, "data/tokens_consumed": 7677673472, "data/tokens_consumed_B": 7.677673472, "train/loss_slope": -7.186990538195625e-06} {"step": 3670, "timestamp": 1778329684.6266367, "train/loss": 2.3854507923126222, "train/z_loss": 0.0013571346527896822, "train/perplexity": 10.86395895041316, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024889.0258616523, "perf/iters_per_sec": 0.9655423287685644, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356873750686646, "data/tokens_consumed": 7698644992, "data/tokens_consumed_B": 7.698644992, "train/loss_slope": -4.794154036985669e-06} {"step": 3675, "timestamp": 1778329690.3847833, "eos/sharpness": 20.961833000183102, "eos/L0_probe": 2.372399091720581, "eos/L_plus": 2.4561734199523926, "eos/L_minus": 2.4982430934906006, "eos/grad_norm": 0.10309096425771713, "eos/embed_grad_frac": 0.25863486528396606, "eos/time_s": 0.591118574142456} {"step": 3675, "timestamp": 1778329691.7607577, "geo/rankme_last": 426.0293273925781, "geo/layer_0/stable_rank_q_proj": 20.53666877746582, "geo/layer_0/stable_rank_k_proj": 17.158222198486328, "geo/layer_0/stable_rank_o_proj": 45.531158447265625, "geo/layer_0/stable_rank_gate_proj": 130.36956787109375, "geo/layer_0/stable_rank_down_proj": 56.139137268066406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06762911379337311, "geo/layer_0/attn_entropy_mean": 6.256588935852051, "geo/layer_0/attn_entropy_std": 0.42417633533477783, "geo/layer_7/stable_rank_q_proj": 42.40074920654297, "geo/layer_7/stable_rank_k_proj": 39.58396911621094, "geo/layer_7/stable_rank_o_proj": 89.581787109375, "geo/layer_7/stable_rank_gate_proj": 79.62825775146484, "geo/layer_7/stable_rank_down_proj": 143.17738342285156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40177953243255615, "geo/layer_7/attn_entropy_mean": 4.735489368438721, "geo/layer_7/attn_entropy_std": 0.767368733882904, "geo/layer_14/stable_rank_q_proj": 51.636566162109375, "geo/layer_14/stable_rank_k_proj": 41.6090087890625, "geo/layer_14/stable_rank_o_proj": 42.777557373046875, "geo/layer_14/stable_rank_gate_proj": 71.5726089477539, "geo/layer_14/stable_rank_down_proj": 127.74657440185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3768959641456604, "geo/layer_14/attn_entropy_mean": 5.503376483917236, "geo/layer_14/attn_entropy_std": 0.455613374710083, "geo/layer_21/stable_rank_q_proj": 39.76301574707031, "geo/layer_21/stable_rank_k_proj": 29.292600631713867, "geo/layer_21/stable_rank_o_proj": 66.6218032836914, "geo/layer_21/stable_rank_gate_proj": 63.034236907958984, "geo/layer_21/stable_rank_down_proj": 49.76371765136719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13775227963924408, "geo/layer_21/attn_entropy_mean": 5.858665466308594, "geo/layer_21/attn_entropy_std": 0.31753554940223694, "geo/layer_27/stable_rank_q_proj": 43.82763671875, "geo/layer_27/stable_rank_k_proj": 30.920215606689453, "geo/layer_27/stable_rank_o_proj": 110.99453735351562, "geo/layer_27/stable_rank_gate_proj": 74.3380126953125, "geo/layer_27/stable_rank_down_proj": 127.03480529785156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09842252731323242, "geo/layer_27/attn_entropy_mean": 4.34869384765625, "geo/layer_27/attn_entropy_std": 0.6570242643356323, "attnres/final_alpha/block_0": 0.2504452168941498, "attnres/block_norm/0": 1.7785263061523438, "attnres/final_alpha/block_1": 0.00408150115981698, "attnres/block_norm/1": 49547.796875, "attnres/final_alpha/block_2": 0.00883988942950964, "attnres/block_norm/2": 29627.240234375, "attnres/final_alpha/block_3": 0.010592079721391201, "attnres/block_norm/3": 68002.1875, "attnres/final_alpha/block_4": 0.012380782514810562, "attnres/block_norm/4": 16668.01171875, "attnres/final_alpha/block_5": 0.6103589534759521, "attnres/block_norm/5": 6927.3681640625, "attnres/final_alpha/block_6": 0.10330154001712799, "attnres/block_norm/6": 45518.8515625, "geo/tier1_time_s": 1.356745958328247, "geo/step": 3675.0, "geo/rankme_slope": 0.0020712507268532414} {"step": 3680, "timestamp": 1778329696.94261, "train/loss": 2.3828396558761598, "train/z_loss": 0.001357175095472485, "train/perplexity": 10.83562867455767, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703734.3816718238, "perf/iters_per_sec": 0.8124038608893508, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2309148788452149, "data/tokens_consumed": 7719616512, "data/tokens_consumed_B": 7.719616512, "train/loss_slope": -6.234158877313452e-06} {"step": 3690, "timestamp": 1778329707.3006608, "train/loss": 2.4034143447875977, "train/z_loss": 0.001351807825267315, "train/perplexity": 11.060877631368093, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025481.05177633, "perf/iters_per_sec": 0.9658246287233019, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353846549987793, "data/tokens_consumed": 7740588032, "data/tokens_consumed_B": 7.740588032, "train/loss_slope": -1.8470041393481919e-06} {"step": 3700, "timestamp": 1778329717.6444407, "grad/layer_0/attn": 0.003706958144903183, "grad/layer_0/mlp": 0.0036289533600211143, "grad/layer_0/attn_mlp_ratio": 1.0214950910066547, "grad/layer_4/attn": 0.001876820344477892, "grad/layer_4/mlp": 0.002737094648182392, "grad/layer_4/attn_mlp_ratio": 0.6856979816735325, "grad/layer_8/attn": 0.005121803842484951, "grad/layer_8/mlp": 0.0036147336941212416, "grad/layer_8/attn_mlp_ratio": 1.4169242146723795, "grad/layer_12/attn": 0.006156622897833586, "grad/layer_12/mlp": 0.0068040997721254826, "grad/layer_12/attn_mlp_ratio": 0.9048401718874776, "grad/layer_16/attn": 0.0038677456323057413, "grad/layer_16/mlp": 0.004361232277005911, "grad/layer_16/attn_mlp_ratio": 0.8868469501184985, "grad/layer_20/attn": 0.0032475735060870647, "grad/layer_20/mlp": 0.005711892619729042, "grad/layer_20/attn_mlp_ratio": 0.5685634631879294, "grad/layer_24/attn": 0.00964569952338934, "grad/layer_24/mlp": 0.009162968024611473, "grad/layer_24/attn_mlp_ratio": 1.052682863479713, "grad/layer_27/attn": 0.005029213149100542, "grad/layer_27/mlp": 0.010692666284739971, "grad/layer_27/attn_mlp_ratio": 0.47034228583788784} {"step": 3700, "timestamp": 1778329717.6600087, "train/loss": 2.3912095069885253, "train/z_loss": 0.0013611072557978333, "train/perplexity": 10.926701876168858, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025600.9258823511, "perf/iters_per_sec": 0.9658817891513591, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353233814239502, "data/tokens_consumed": 7761559552, "data/tokens_consumed_B": 7.761559552, "train/loss_slope": -1.216192156782737e-06} {"step": 3710, "timestamp": 1778329728.0185204, "train/loss": 2.3697327852249144, "train/z_loss": 0.0013580595026724041, "train/perplexity": 10.694534164720723, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025660.9149055472, "perf/iters_per_sec": 0.9659103941467033, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352927207946778, "data/tokens_consumed": 7782531072, "data/tokens_consumed_B": 7.782531072, "train/loss_slope": -1.6373730621428774e-06} {"step": 3720, "timestamp": 1778329738.960914, "train/loss": 2.352311944961548, "train/z_loss": 0.0013492637081071734, "train/perplexity": 10.509839829584449, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917696.8412814806, "perf/iters_per_sec": 0.9144291120917705, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0935784816741942, "data/tokens_consumed": 7803502592, "data/tokens_consumed_B": 7.803502592, "train/loss_slope": -3.996347687174962e-06} {"step": 3730, "timestamp": 1778329749.3181186, "train/loss": 2.359969472885132, "train/z_loss": 0.0013526760623790324, "train/perplexity": 10.590628146177059, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026225.200157996, "perf/iters_per_sec": 0.966179466322897, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350044012069701, "data/tokens_consumed": 7824474112, "data/tokens_consumed_B": 7.824474112, "train/loss_slope": -6.942552408583993e-06} {"step": 3740, "timestamp": 1778329759.6682086, "train/loss": 2.370597553253174, "train/z_loss": 0.0013527716626413166, "train/perplexity": 10.703786455909718, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027242.3883990783, "perf/iters_per_sec": 0.9666644994731323, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344850778579713, "data/tokens_consumed": 7845445632, "data/tokens_consumed_B": 7.845445632, "train/loss_slope": -9.186425396461085e-06} {"step": 3750, "timestamp": 1778329770.0092022, "grad/layer_0/attn": 0.002416818169876933, "grad/layer_0/mlp": 0.002931485651060939, "grad/layer_0/attn_mlp_ratio": 0.8244345615537304, "grad/layer_4/attn": 0.001931018428876996, "grad/layer_4/mlp": 0.0025557924527674913, "grad/layer_4/attn_mlp_ratio": 0.7555458391120318, "grad/layer_8/attn": 0.004659686703234911, "grad/layer_8/mlp": 0.0034927406813949347, "grad/layer_8/attn_mlp_ratio": 1.3341060773980256, "grad/layer_12/attn": 0.006063829176127911, "grad/layer_12/mlp": 0.006337362807244062, "grad/layer_12/attn_mlp_ratio": 0.9568379253137795, "grad/layer_16/attn": 0.00339974882081151, "grad/layer_16/mlp": 0.004765412770211697, "grad/layer_16/attn_mlp_ratio": 0.7134216726662094, "grad/layer_20/attn": 0.0030862882267683744, "grad/layer_20/mlp": 0.006278600078076124, "grad/layer_20/attn_mlp_ratio": 0.49155673864139393, "grad/layer_24/attn": 0.011325743980705738, "grad/layer_24/mlp": 0.011188783682882786, "grad/layer_24/attn_mlp_ratio": 1.012240847663218, "grad/layer_27/attn": 0.008616049773991108, "grad/layer_27/mlp": 0.010623585432767868, "grad/layer_27/attn_mlp_ratio": 0.811030301155422} {"step": 3750, "timestamp": 1778329770.6047482, "eos/sharpness": 62.888908386230455, "eos/L0_probe": 2.3679845333099365, "eos/L_plus": 2.7469258308410645, "eos/L_minus": 2.6179323196411133, "eos/grad_norm": 0.2020551711320877, "eos/embed_grad_frac": 0.051066458225250244, "eos/time_s": 0.5927290916442871} {"step": 3750, "timestamp": 1778329770.6258645, "train/loss": 2.3525170564651487, "train/z_loss": 0.0013574036303907633, "train/perplexity": 10.511995739727926, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914783.213547379, "perf/iters_per_sec": 0.9130397861229796, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0952425241470336, "data/tokens_consumed": 7866417152, "data/tokens_consumed_B": 7.866417152, "train/loss_slope": -9.047780622063919e-06} {"step": 3750, "timestamp": 1778329771.9879262, "geo/rankme_last": 426.53668212890625, "geo/layer_0/stable_rank_q_proj": 20.56882095336914, "geo/layer_0/stable_rank_k_proj": 17.135169982910156, "geo/layer_0/stable_rank_o_proj": 45.555606842041016, "geo/layer_0/stable_rank_gate_proj": 130.3746337890625, "geo/layer_0/stable_rank_down_proj": 56.15103530883789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06556065380573273, "geo/layer_0/attn_entropy_mean": 6.255680084228516, "geo/layer_0/attn_entropy_std": 0.4258328378200531, "geo/layer_7/stable_rank_q_proj": 42.30950164794922, "geo/layer_7/stable_rank_k_proj": 39.73344421386719, "geo/layer_7/stable_rank_o_proj": 89.64781951904297, "geo/layer_7/stable_rank_gate_proj": 79.62207794189453, "geo/layer_7/stable_rank_down_proj": 143.34637451171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4277307093143463, "geo/layer_7/attn_entropy_mean": 4.750157356262207, "geo/layer_7/attn_entropy_std": 0.7778377532958984, "geo/layer_14/stable_rank_q_proj": 51.66067886352539, "geo/layer_14/stable_rank_k_proj": 41.65627670288086, "geo/layer_14/stable_rank_o_proj": 42.76473617553711, "geo/layer_14/stable_rank_gate_proj": 71.56478118896484, "geo/layer_14/stable_rank_down_proj": 127.60889434814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3631535768508911, "geo/layer_14/attn_entropy_mean": 5.545969009399414, "geo/layer_14/attn_entropy_std": 0.43711620569229126, "geo/layer_21/stable_rank_q_proj": 39.788612365722656, "geo/layer_21/stable_rank_k_proj": 29.26039695739746, "geo/layer_21/stable_rank_o_proj": 66.6781997680664, "geo/layer_21/stable_rank_gate_proj": 63.000980377197266, "geo/layer_21/stable_rank_down_proj": 49.69527816772461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1384512037038803, "geo/layer_21/attn_entropy_mean": 5.893928050994873, "geo/layer_21/attn_entropy_std": 0.3176439106464386, "geo/layer_27/stable_rank_q_proj": 43.813228607177734, "geo/layer_27/stable_rank_k_proj": 30.868228912353516, "geo/layer_27/stable_rank_o_proj": 110.67787170410156, "geo/layer_27/stable_rank_gate_proj": 74.25569915771484, "geo/layer_27/stable_rank_down_proj": 127.49726867675781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09949573874473572, "geo/layer_27/attn_entropy_mean": 4.343235969543457, "geo/layer_27/attn_entropy_std": 0.6473478674888611, "attnres/final_alpha/block_0": 0.24972181022167206, "attnres/block_norm/0": 1.7783949375152588, "attnres/final_alpha/block_1": 0.00409943051636219, "attnres/block_norm/1": 49528.953125, "attnres/final_alpha/block_2": 0.008717847988009453, "attnres/block_norm/2": 29670.970703125, "attnres/final_alpha/block_3": 0.010498682036995888, "attnres/block_norm/3": 67486.265625, "attnres/final_alpha/block_4": 0.01204724982380867, "attnres/block_norm/4": 16665.12890625, "attnres/final_alpha/block_5": 0.612213134765625, "attnres/block_norm/5": 7036.669921875, "attnres/final_alpha/block_6": 0.10270185768604279, "attnres/block_norm/6": 45436.046875, "geo/tier1_time_s": 1.3581812381744385, "geo/step": 3750.0, "geo/rankme_slope": 0.0019162574404761904} {"step": 3760, "timestamp": 1778329782.3377724, "train/loss": 2.404171872138977, "train/z_loss": 0.001352904620580375, "train/perplexity": 11.069259723135163, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791174.1179396673, "perf/iters_per_sec": 0.85409837624534, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1708253145217895, "data/tokens_consumed": 7887388672, "data/tokens_consumed_B": 7.887388672, "train/loss_slope": -8.321707505490179e-06} {"step": 3770, "timestamp": 1778329792.6917133, "train/loss": 2.36772735118866, "train/z_loss": 0.0013618995202705265, "train/perplexity": 10.673108472991386, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026385.6819921713, "perf/iters_per_sec": 0.9662559900246483, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034922432899475, "data/tokens_consumed": 7908360192, "data/tokens_consumed_B": 7.908360192, "train/loss_slope": -1.1759987916096538e-05} {"step": 3780, "timestamp": 1778329803.0492058, "train/loss": 2.3694196462631227, "train/z_loss": 0.0013499136897735299, "train/perplexity": 10.691185813672483, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025614.9198934883, "perf/iters_per_sec": 0.9658884620158616, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353162288665771, "data/tokens_consumed": 7929331712, "data/tokens_consumed_B": 7.929331712, "train/loss_slope": -1.2615729723111088e-05} {"step": 3790, "timestamp": 1778329813.405524, "train/loss": 2.3970205545425416, "train/z_loss": 0.0013543460168875753, "train/perplexity": 10.990382306185294, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026041.830136754, "perf/iters_per_sec": 0.9660920286830683, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350980758666992, "data/tokens_consumed": 7950303232, "data/tokens_consumed_B": 7.950303232, "train/loss_slope": -1.370168394631814e-05} {"step": 3800, "timestamp": 1778329823.748706, "grad/layer_0/attn": 0.0036666279193013906, "grad/layer_0/mlp": 0.003627773839980364, "grad/layer_0/attn_mlp_ratio": 1.0107101434554209, "grad/layer_4/attn": 0.0022108207922428846, "grad/layer_4/mlp": 0.0028288031462579966, "grad/layer_4/attn_mlp_ratio": 0.7815392587545307, "grad/layer_8/attn": 0.004139363300055265, "grad/layer_8/mlp": 0.003602914744988084, "grad/layer_8/attn_mlp_ratio": 1.148892904258734, "grad/layer_12/attn": 0.0059661888517439365, "grad/layer_12/mlp": 0.007483092602342367, "grad/layer_12/attn_mlp_ratio": 0.7972891809660988, "grad/layer_16/attn": 0.00394846498966217, "grad/layer_16/mlp": 0.005443920847028494, "grad/layer_16/attn_mlp_ratio": 0.7252980026863539, "grad/layer_20/attn": 0.002936886390671134, "grad/layer_20/mlp": 0.006863666232675314, "grad/layer_20/attn_mlp_ratio": 0.42788886410310056, "grad/layer_24/attn": 0.017875323072075844, "grad/layer_24/mlp": 0.014331698417663574, "grad/layer_24/attn_mlp_ratio": 1.2472578215376784, "grad/layer_27/attn": 0.008801233023405075, "grad/layer_27/mlp": 0.014852750115096569, "grad/layer_27/attn_mlp_ratio": 0.5925658814661384} {"step": 3800, "timestamp": 1778329823.7642105, "train/loss": 2.374378967285156, "train/z_loss": 0.0013595904223620892, "train/perplexity": 10.744338527975128, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025474.568717447, "perf/iters_per_sec": 0.9658215373599277, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035387969017029, "data/tokens_consumed": 7971274752, "data/tokens_consumed_B": 7.971274752, "train/loss_slope": -1.218248981632624e-05} {"step": 3810, "timestamp": 1778329834.1143684, "train/loss": 2.416298675537109, "train/z_loss": 0.0013567106099799276, "train/perplexity": 11.204311678767267, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027044.8202634493, "perf/iters_per_sec": 0.9665702916447875, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345859050750732, "data/tokens_consumed": 7992246272, "data/tokens_consumed_B": 7.992246272, "train/loss_slope": -1.0417246568177895e-05} {"step": 3820, "timestamp": 1778329844.4628305, "train/loss": 2.3657548427581787, "train/z_loss": 0.0013649108004756272, "train/perplexity": 10.652076426312934, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027572.0658034685, "perf/iters_per_sec": 0.9668217019097655, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343168735504151, "data/tokens_consumed": 8013217792, "data/tokens_consumed_B": 8.013217792, "train/loss_slope": -1.0823405801158985e-05} {"step": 3825, "timestamp": 1778329850.2160394, "eos/sharpness": 60.510230064392076, "eos/L0_probe": 2.368760585784912, "eos/L_plus": 2.6240804195404053, "eos/L_minus": 2.71854305267334, "eos/grad_norm": 0.17516928911209106, "eos/embed_grad_frac": 0.08129166066646576, "eos/time_s": 0.5904059410095215} {"step": 3825, "timestamp": 1778329851.5919328, "geo/rankme_last": 425.9407653808594, "geo/layer_0/stable_rank_q_proj": 20.560720443725586, "geo/layer_0/stable_rank_k_proj": 17.13611602783203, "geo/layer_0/stable_rank_o_proj": 45.589805603027344, "geo/layer_0/stable_rank_gate_proj": 130.18585205078125, "geo/layer_0/stable_rank_down_proj": 56.26053237915039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06648528575897217, "geo/layer_0/attn_entropy_mean": 6.258199691772461, "geo/layer_0/attn_entropy_std": 0.42838242650032043, "geo/layer_7/stable_rank_q_proj": 42.2879638671875, "geo/layer_7/stable_rank_k_proj": 39.81559753417969, "geo/layer_7/stable_rank_o_proj": 89.71583557128906, "geo/layer_7/stable_rank_gate_proj": 79.65472412109375, "geo/layer_7/stable_rank_down_proj": 142.99574279785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4117141366004944, "geo/layer_7/attn_entropy_mean": 4.76995849609375, "geo/layer_7/attn_entropy_std": 0.7737919092178345, "geo/layer_14/stable_rank_q_proj": 51.67616653442383, "geo/layer_14/stable_rank_k_proj": 41.67313766479492, "geo/layer_14/stable_rank_o_proj": 42.75142288208008, "geo/layer_14/stable_rank_gate_proj": 71.55242156982422, "geo/layer_14/stable_rank_down_proj": 127.70425415039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37739625573158264, "geo/layer_14/attn_entropy_mean": 5.528609752655029, "geo/layer_14/attn_entropy_std": 0.4586050808429718, "geo/layer_21/stable_rank_q_proj": 39.79609298706055, "geo/layer_21/stable_rank_k_proj": 29.127975463867188, "geo/layer_21/stable_rank_o_proj": 66.61347198486328, "geo/layer_21/stable_rank_gate_proj": 62.98433303833008, "geo/layer_21/stable_rank_down_proj": 49.76195526123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13507972657680511, "geo/layer_21/attn_entropy_mean": 5.869512557983398, "geo/layer_21/attn_entropy_std": 0.31998971104621887, "geo/layer_27/stable_rank_q_proj": 43.780269622802734, "geo/layer_27/stable_rank_k_proj": 30.799175262451172, "geo/layer_27/stable_rank_o_proj": 110.92807006835938, "geo/layer_27/stable_rank_gate_proj": 74.13973999023438, "geo/layer_27/stable_rank_down_proj": 127.412109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10107658803462982, "geo/layer_27/attn_entropy_mean": 4.315916538238525, "geo/layer_27/attn_entropy_std": 0.6609888672828674, "attnres/final_alpha/block_0": 0.2504761815071106, "attnres/block_norm/0": 1.7784802913665771, "attnres/final_alpha/block_1": 0.004051554948091507, "attnres/block_norm/1": 49701.4375, "attnres/final_alpha/block_2": 0.008832676336169243, "attnres/block_norm/2": 29731.046875, "attnres/final_alpha/block_3": 0.010500933043658733, "attnres/block_norm/3": 67793.140625, "attnres/final_alpha/block_4": 0.012518897652626038, "attnres/block_norm/4": 16723.41796875, "attnres/final_alpha/block_5": 0.609900176525116, "attnres/block_norm/5": 6943.4599609375, "attnres/final_alpha/block_6": 0.1037195697426796, "attnres/block_norm/6": 45453.5234375, "geo/tier1_time_s": 1.356419563293457, "geo/step": 3825.0, "geo/rankme_slope": 0.0018845925479566827} {"step": 3830, "timestamp": 1778329856.7660325, "train/loss": 2.3276203155517576, "train/z_loss": 0.0013623948441818356, "train/perplexity": 10.253512353814596, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705117.5437386327, "perf/iters_per_sec": 0.8130634039586223, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2299163818359375, "data/tokens_consumed": 8034189312, "data/tokens_consumed_B": 8.034189312, "train/loss_slope": -1.1315236576605627e-05} {"step": 3840, "timestamp": 1778329867.1143413, "train/loss": 2.3553698539733885, "train/z_loss": 0.0013555529876612127, "train/perplexity": 10.54202715138158, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027355.0414314985, "perf/iters_per_sec": 0.9667182166249745, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344275951385498, "data/tokens_consumed": 8055160832, "data/tokens_consumed_B": 8.055160832, "train/loss_slope": -1.3484136785718345e-05} {"step": 3850, "timestamp": 1778329877.4480126, "grad/layer_0/attn": 0.0030265720561146736, "grad/layer_0/mlp": 0.003209832590073347, "grad/layer_0/attn_mlp_ratio": 0.942906484027837, "grad/layer_4/attn": 0.00205142330378294, "grad/layer_4/mlp": 0.0027053947560489178, "grad/layer_4/attn_mlp_ratio": 0.7582713108203851, "grad/layer_8/attn": 0.003277993993833661, "grad/layer_8/mlp": 0.003606455633416772, "grad/layer_8/attn_mlp_ratio": 0.9089239508641009, "grad/layer_12/attn": 0.008788051083683968, "grad/layer_12/mlp": 0.007016700226813555, "grad/layer_12/attn_mlp_ratio": 1.2524478279486142, "grad/layer_16/attn": 0.0036004141438752413, "grad/layer_16/mlp": 0.004077334422618151, "grad/layer_16/attn_mlp_ratio": 0.8830313342951641, "grad/layer_20/attn": 0.004337613936513662, "grad/layer_20/mlp": 0.005630316212773323, "grad/layer_20/attn_mlp_ratio": 0.7704032412305243, "grad/layer_24/attn": 0.00674829725176096, "grad/layer_24/mlp": 0.007803584448993206, "grad/layer_24/attn_mlp_ratio": 0.8647689032383975, "grad/layer_27/attn": 0.00839185155928135, "grad/layer_27/mlp": 0.008013233542442322, "grad/layer_27/attn_mlp_ratio": 1.0472490799261944} {"step": 3850, "timestamp": 1778329877.4634962, "train/loss": 2.385151410102844, "train/z_loss": 0.0013487313059158622, "train/perplexity": 10.860706961193795, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027546.828024126, "perf/iters_per_sec": 0.9668096675987844, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343297481536866, "data/tokens_consumed": 8076132352, "data/tokens_consumed_B": 8.076132352, "train/loss_slope": -1.4933469035837883e-05} {"step": 3860, "timestamp": 1778329887.8095672, "train/loss": 2.371138405799866, "train/z_loss": 0.001351919851731509, "train/perplexity": 10.709577191899637, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027946.967536177, "perf/iters_per_sec": 0.9670004689865956, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341256618499757, "data/tokens_consumed": 8097103872, "data/tokens_consumed_B": 8.097103872, "train/loss_slope": -1.3583832090408107e-05} {"step": 3870, "timestamp": 1778329898.1567268, "train/loss": 2.386218857765198, "train/z_loss": 0.0013471942394971848, "train/perplexity": 10.872306387239732, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027720.794515124, "perf/iters_per_sec": 0.9668926212859745, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034241008758545, "data/tokens_consumed": 8118075392, "data/tokens_consumed_B": 8.118075392, "train/loss_slope": -1.1378368729054253e-05} {"step": 3880, "timestamp": 1778329908.501776, "train/loss": 2.3903743267059325, "train/z_loss": 0.001349736948031932, "train/perplexity": 10.91757991997732, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028492.9264570703, "perf/iters_per_sec": 0.96726080248693, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338473320007324, "data/tokens_consumed": 8139046912, "data/tokens_consumed_B": 8.139046912, "train/loss_slope": -1.1640282861828006e-05} {"step": 3890, "timestamp": 1778329918.8437052, "train/loss": 2.346188449859619, "train/z_loss": 0.0013686643098481, "train/perplexity": 10.445679520022432, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028597.905475902, "perf/iters_per_sec": 0.9673108603839407, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033793830871582, "data/tokens_consumed": 8160018432, "data/tokens_consumed_B": 8.160018432, "train/loss_slope": -1.2982547901930662e-05} {"step": 3900, "timestamp": 1778329929.1908088, "grad/layer_0/attn": 0.002707837847992778, "grad/layer_0/mlp": 0.0030175705906003714, "grad/layer_0/attn_mlp_ratio": 0.8973568892445831, "grad/layer_4/attn": 0.005278108641505241, "grad/layer_4/mlp": 0.002641449449583888, "grad/layer_4/attn_mlp_ratio": 1.9981864284845836, "grad/layer_8/attn": 0.005992002319544554, "grad/layer_8/mlp": 0.003387880278751254, "grad/layer_8/attn_mlp_ratio": 1.7686581725630934, "grad/layer_12/attn": 0.004729726817458868, "grad/layer_12/mlp": 0.006030003074556589, "grad/layer_12/attn_mlp_ratio": 0.7843655601071329, "grad/layer_16/attn": 0.004525217693299055, "grad/layer_16/mlp": 0.004454709589481354, "grad/layer_16/attn_mlp_ratio": 1.0158277438334957, "grad/layer_20/attn": 0.004421417601406574, "grad/layer_20/mlp": 0.005994521081447601, "grad/layer_20/attn_mlp_ratio": 0.7375764414829973, "grad/layer_24/attn": 0.015879767015576363, "grad/layer_24/mlp": 0.013328871689736843, "grad/layer_24/attn_mlp_ratio": 1.1913811810991906, "grad/layer_27/attn": 0.007677131332457066, "grad/layer_27/mlp": 0.01413825061172247, "grad/layer_27/attn_mlp_ratio": 0.5430043283991077} {"step": 3900, "timestamp": 1778329929.7882438, "eos/sharpness": 62.81771659851073, "eos/L0_probe": 2.3677568435668945, "eos/L_plus": 2.6578240394592285, "eos/L_minus": 2.705866813659668, "eos/grad_norm": 0.22756657004356384, "eos/embed_grad_frac": 0.05069294944405556, "eos/time_s": 0.5945146083831787} {"step": 3900, "timestamp": 1778329929.808714, "train/loss": 2.3655974864959717, "train/z_loss": 0.0013449237332679332, "train/perplexity": 10.650400387252825, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914110.6177130437, "perf/iters_per_sec": 0.9127190674367159, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0956273794174194, "data/tokens_consumed": 8180989952, "data/tokens_consumed_B": 8.180989952, "train/loss_slope": -1.299881988053842e-05} {"step": 3900, "timestamp": 1778329931.1707213, "geo/rankme_last": 426.2046813964844, "geo/layer_0/stable_rank_q_proj": 20.584630966186523, "geo/layer_0/stable_rank_k_proj": 17.19144630432129, "geo/layer_0/stable_rank_o_proj": 45.610198974609375, "geo/layer_0/stable_rank_gate_proj": 130.22445678710938, "geo/layer_0/stable_rank_down_proj": 56.27839279174805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06534980237483978, "geo/layer_0/attn_entropy_mean": 6.255283832550049, "geo/layer_0/attn_entropy_std": 0.42534011602401733, "geo/layer_7/stable_rank_q_proj": 42.37264633178711, "geo/layer_7/stable_rank_k_proj": 39.72747802734375, "geo/layer_7/stable_rank_o_proj": 89.68293762207031, "geo/layer_7/stable_rank_gate_proj": 79.69542694091797, "geo/layer_7/stable_rank_down_proj": 142.5816650390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42220866680145264, "geo/layer_7/attn_entropy_mean": 4.727425575256348, "geo/layer_7/attn_entropy_std": 0.770353376865387, "geo/layer_14/stable_rank_q_proj": 51.61497116088867, "geo/layer_14/stable_rank_k_proj": 41.78105545043945, "geo/layer_14/stable_rank_o_proj": 42.77012634277344, "geo/layer_14/stable_rank_gate_proj": 71.55204010009766, "geo/layer_14/stable_rank_down_proj": 127.51956939697266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3746911287307739, "geo/layer_14/attn_entropy_mean": 5.522218704223633, "geo/layer_14/attn_entropy_std": 0.4249015748500824, "geo/layer_21/stable_rank_q_proj": 39.734588623046875, "geo/layer_21/stable_rank_k_proj": 29.071666717529297, "geo/layer_21/stable_rank_o_proj": 66.69056701660156, "geo/layer_21/stable_rank_gate_proj": 62.88473892211914, "geo/layer_21/stable_rank_down_proj": 49.72046661376953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13424533605575562, "geo/layer_21/attn_entropy_mean": 5.85642147064209, "geo/layer_21/attn_entropy_std": 0.3168851137161255, "geo/layer_27/stable_rank_q_proj": 43.65959167480469, "geo/layer_27/stable_rank_k_proj": 30.800739288330078, "geo/layer_27/stable_rank_o_proj": 110.76024627685547, "geo/layer_27/stable_rank_gate_proj": 74.05123138427734, "geo/layer_27/stable_rank_down_proj": 127.26657104492188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09393323212862015, "geo/layer_27/attn_entropy_mean": 4.315191745758057, "geo/layer_27/attn_entropy_std": 0.6393570303916931, "attnres/final_alpha/block_0": 0.2516968846321106, "attnres/block_norm/0": 1.7784984111785889, "attnres/final_alpha/block_1": 0.004042484797537327, "attnres/block_norm/1": 49750.8671875, "attnres/final_alpha/block_2": 0.008793319575488567, "attnres/block_norm/2": 29682.2734375, "attnres/final_alpha/block_3": 0.010547514073550701, "attnres/block_norm/3": 68413.0390625, "attnres/final_alpha/block_4": 0.012314332649111748, "attnres/block_norm/4": 16663.34375, "attnres/final_alpha/block_5": 0.6087731122970581, "attnres/block_norm/5": 6930.68896484375, "attnres/final_alpha/block_6": 0.10383231937885284, "attnres/block_norm/6": 45406.3046875, "geo/tier1_time_s": 1.3580241203308105, "geo/step": 3900.0, "geo/rankme_slope": 0.0018550115358643457} {"step": 3910, "timestamp": 1778329941.5214279, "train/loss": 2.412826108932495, "train/z_loss": 0.0013424850068986416, "train/perplexity": 11.165471436897594, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791069.5889777, "perf/iters_per_sec": 0.8540485329521657, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1708936452865601, "data/tokens_consumed": 8201961472, "data/tokens_consumed_B": 8.201961472, "train/loss_slope": -1.320726471622531e-05} {"step": 3920, "timestamp": 1778329951.8662765, "train/loss": 2.3581591606140138, "train/z_loss": 0.0013456511194817721, "train/perplexity": 10.571473145582779, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028181.7022702252, "perf/iters_per_sec": 0.9671123992301107, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340059757232667, "data/tokens_consumed": 8222932992, "data/tokens_consumed_B": 8.222932992, "train/loss_slope": -1.294507971762748e-05} {"step": 3930, "timestamp": 1778329962.216871, "train/loss": 2.379376530647278, "train/z_loss": 0.0013528096489608288, "train/perplexity": 10.798168437708783, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027515.7022948763, "perf/iters_per_sec": 0.966794825694502, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343456268310547, "data/tokens_consumed": 8243904512, "data/tokens_consumed_B": 8.243904512, "train/loss_slope": -1.4796439570562233e-05} {"step": 3940, "timestamp": 1778329972.5627112, "train/loss": 2.3724210262298584, "train/z_loss": 0.001358888007234782, "train/perplexity": 10.723322327414, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028422.291610524, "perf/iters_per_sec": 0.9672271211674328, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338833332061768, "data/tokens_consumed": 8264876032, "data/tokens_consumed_B": 8.264876032, "train/loss_slope": -1.4786374944485975e-05} {"step": 3950, "timestamp": 1778329982.8957036, "grad/layer_0/attn": 0.0028867130167782307, "grad/layer_0/mlp": 0.003077663481235504, "grad/layer_0/attn_mlp_ratio": 0.9379559983029017, "grad/layer_4/attn": 0.002719044452533126, "grad/layer_4/mlp": 0.002694424707442522, "grad/layer_4/attn_mlp_ratio": 1.0091372544606179, "grad/layer_8/attn": 0.004417541436851025, "grad/layer_8/mlp": 0.003567707957699895, "grad/layer_8/attn_mlp_ratio": 1.2382014910993075, "grad/layer_12/attn": 0.005972851067781448, "grad/layer_12/mlp": 0.006943633314222097, "grad/layer_12/attn_mlp_ratio": 0.8601910140514804, "grad/layer_16/attn": 0.005536792334169149, "grad/layer_16/mlp": 0.004645572043955326, "grad/layer_16/attn_mlp_ratio": 1.1918429340018866, "grad/layer_20/attn": 0.004312895704060793, "grad/layer_20/mlp": 0.006587059702724218, "grad/layer_20/attn_mlp_ratio": 0.6547527779051139, "grad/layer_24/attn": 0.016735317185521126, "grad/layer_24/mlp": 0.013625538907945156, "grad/layer_24/attn_mlp_ratio": 1.2282315712987673, "grad/layer_27/attn": 0.0071942671202123165, "grad/layer_27/mlp": 0.015178442932665348, "grad/layer_27/attn_mlp_ratio": 0.4739792549690123} {"step": 3950, "timestamp": 1778329982.9115775, "train/loss": 2.383018159866333, "train/z_loss": 0.0013415412860922516, "train/perplexity": 10.837563050153857, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027438.6865802864, "perf/iters_per_sec": 0.96675810174002, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343849182128906, "data/tokens_consumed": 8285847552, "data/tokens_consumed_B": 8.285847552, "train/loss_slope": -1.2917426281278587e-05} {"step": 3960, "timestamp": 1778329993.2566094, "train/loss": 2.378450798988342, "train/z_loss": 0.0013547223061323166, "train/perplexity": 10.788176856802409, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028636.409862602, "perf/iters_per_sec": 0.967329220706273, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033774209022522, "data/tokens_consumed": 8306819072, "data/tokens_consumed_B": 8.306819072, "train/loss_slope": -1.1255733851659665e-05} {"step": 3970, "timestamp": 1778330003.6025834, "train/loss": 2.360581469535828, "train/z_loss": 0.0013513460638932885, "train/perplexity": 10.597111558842345, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028297.6401246344, "perf/iters_per_sec": 0.9671676827071354, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339468717575073, "data/tokens_consumed": 8327790592, "data/tokens_consumed_B": 8.327790592, "train/loss_slope": -1.0708653959039047e-05} {"step": 3975, "timestamp": 1778330009.3601084, "eos/sharpness": 31.967687606811516, "eos/L0_probe": 2.3726606369018555, "eos/L_plus": 2.579450845718384, "eos/L_minus": 2.4855473041534424, "eos/grad_norm": 0.10182774066925049, "eos/embed_grad_frac": 0.21951846778392792, "eos/time_s": 0.5957045555114746} {"step": 3975, "timestamp": 1778330010.7384458, "geo/rankme_last": 426.6614074707031, "geo/layer_0/stable_rank_q_proj": 20.574748992919922, "geo/layer_0/stable_rank_k_proj": 17.210905075073242, "geo/layer_0/stable_rank_o_proj": 45.68217086791992, "geo/layer_0/stable_rank_gate_proj": 130.28443908691406, "geo/layer_0/stable_rank_down_proj": 56.290977478027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06471075862646103, "geo/layer_0/attn_entropy_mean": 6.2577314376831055, "geo/layer_0/attn_entropy_std": 0.42492440342903137, "geo/layer_7/stable_rank_q_proj": 42.509700775146484, "geo/layer_7/stable_rank_k_proj": 39.809017181396484, "geo/layer_7/stable_rank_o_proj": 89.80004119873047, "geo/layer_7/stable_rank_gate_proj": 79.65769958496094, "geo/layer_7/stable_rank_down_proj": 142.751708984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4201052784919739, "geo/layer_7/attn_entropy_mean": 4.729519844055176, "geo/layer_7/attn_entropy_std": 0.7555742859840393, "geo/layer_14/stable_rank_q_proj": 51.5505256652832, "geo/layer_14/stable_rank_k_proj": 41.71037292480469, "geo/layer_14/stable_rank_o_proj": 42.715354919433594, "geo/layer_14/stable_rank_gate_proj": 71.53242492675781, "geo/layer_14/stable_rank_down_proj": 127.40243530273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3735831081867218, "geo/layer_14/attn_entropy_mean": 5.503787040710449, "geo/layer_14/attn_entropy_std": 0.444280207157135, "geo/layer_21/stable_rank_q_proj": 39.71950912475586, "geo/layer_21/stable_rank_k_proj": 29.137725830078125, "geo/layer_21/stable_rank_o_proj": 66.60968780517578, "geo/layer_21/stable_rank_gate_proj": 62.79995346069336, "geo/layer_21/stable_rank_down_proj": 49.73649597167969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14301848411560059, "geo/layer_21/attn_entropy_mean": 5.866718769073486, "geo/layer_21/attn_entropy_std": 0.3189191520214081, "geo/layer_27/stable_rank_q_proj": 43.763580322265625, "geo/layer_27/stable_rank_k_proj": 30.860488891601562, "geo/layer_27/stable_rank_o_proj": 110.55575561523438, "geo/layer_27/stable_rank_gate_proj": 74.00033569335938, "geo/layer_27/stable_rank_down_proj": 127.31712341308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10358661413192749, "geo/layer_27/attn_entropy_mean": 4.293414115905762, "geo/layer_27/attn_entropy_std": 0.6273627877235413, "attnres/final_alpha/block_0": 0.24894508719444275, "attnres/block_norm/0": 1.7783775329589844, "attnres/final_alpha/block_1": 0.004027609247714281, "attnres/block_norm/1": 49670.28125, "attnres/final_alpha/block_2": 0.008719421923160553, "attnres/block_norm/2": 29662.11328125, "attnres/final_alpha/block_3": 0.010442429222166538, "attnres/block_norm/3": 68577.90625, "attnres/final_alpha/block_4": 0.011905347928404808, "attnres/block_norm/4": 16684.90234375, "attnres/final_alpha/block_5": 0.6139184236526489, "attnres/block_norm/5": 7005.931640625, "attnres/final_alpha/block_6": 0.10204166918992996, "attnres/block_norm/6": 45618.1875, "geo/tier1_time_s": 1.357114553451538, "geo/step": 3975.0, "geo/rankme_slope": 0.0018173852353441378} {"step": 3980, "timestamp": 1778330015.9153948, "train/loss": 2.401963448524475, "train/z_loss": 0.0013455305714160204, "train/perplexity": 11.044841081843987, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703931.8765724786, "perf/iters_per_sec": 0.8124980337965386, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2307722091674804, "data/tokens_consumed": 8348762112, "data/tokens_consumed_B": 8.348762112, "train/loss_slope": -8.912235661165257e-06} {"step": 3990, "timestamp": 1778330026.2716517, "train/loss": 2.4044673204422, "train/z_loss": 0.001352167292498052, "train/perplexity": 11.07253060030206, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026169.704938817, "perf/iters_per_sec": 0.9661530041402898, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350327491760254, "data/tokens_consumed": 8369733632, "data/tokens_consumed_B": 8.369733632, "train/loss_slope": -7.381001171177236e-06} {"step": 4000, "timestamp": 1778330036.6082387, "grad/layer_0/attn": 0.003252916969358921, "grad/layer_0/mlp": 0.0032432887237519026, "grad/layer_0/attn_mlp_ratio": 1.0029686365076425, "grad/layer_4/attn": 0.0024297793861478567, "grad/layer_4/mlp": 0.002794487401843071, "grad/layer_4/attn_mlp_ratio": 0.8694901603765716, "grad/layer_8/attn": 0.003786175511777401, "grad/layer_8/mlp": 0.0035949547309428453, "grad/layer_8/attn_mlp_ratio": 1.0531914001223777, "grad/layer_12/attn": 0.006631779950112104, "grad/layer_12/mlp": 0.007687493227422237, "grad/layer_12/attn_mlp_ratio": 0.862671311395579, "grad/layer_16/attn": 0.003907604608684778, "grad/layer_16/mlp": 0.005017619114369154, "grad/layer_16/attn_mlp_ratio": 0.7787766352405573, "grad/layer_20/attn": 0.003947108052670956, "grad/layer_20/mlp": 0.006243712734431028, "grad/layer_20/attn_mlp_ratio": 0.6321732208606042, "grad/layer_24/attn": 0.009251623414456844, "grad/layer_24/mlp": 0.01007060706615448, "grad/layer_24/attn_mlp_ratio": 0.9186758317363333, "grad/layer_27/attn": 0.012650180608034134, "grad/layer_27/mlp": 0.010327663272619247, "grad/layer_27/attn_mlp_ratio": 1.2248831271526874} {"step": 4000, "timestamp": 1778330036.6237311, "train/loss": 2.3812267065048216, "train/z_loss": 0.001349336712155491, "train/perplexity": 10.818165441540383, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026768.691180723, "perf/iters_per_sec": 0.966438623037683, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034726858139038, "data/tokens_consumed": 8390705152, "data/tokens_consumed_B": 8.390705152, "train/loss_slope": -6.299369684969531e-06} {"step": 4000, "timestamp": 1778330043.5146196, "geo/ww_alpha_mean": 7.528225855100126, "geo/ww_alpha_std": 4.455648429199882, "geo/ww_alpha_min": 1.3348572719797398, "geo/ww_alpha_max": 31.10838210868541, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.935624899607266, "geo/ww_alpha_by_type/k_proj": 4.464451994219485, "geo/ww_alpha_by_type/v_proj": 8.807167964726236, "geo/ww_alpha_by_type/o_proj": 7.922346983508576, "geo/ww_alpha_by_type/gate_proj": 7.831626234810945, "geo/ww_alpha_by_type/up_proj": 11.736880858924085, "geo/ww_alpha_by_type/down_proj": 8.090044662838212, "geo/twonn_id/layer_0": 0.733705461025238, "geo/twonn_id/layer_7": 3.742741823196411, "geo/twonn_id/layer_14": 5.443508625030518, "geo/twonn_id/layer_21": 8.046317100524902, "geo/twonn_id/layer_27": 5.5664591789245605, "geo/tier2_time_s": 6.8846330642700195} {"step": 4000, "timestamp": 1778330044.269063, "eoc/jacobian_sigma/layer_0/attn": 1451.3048095703125, "eoc/jacobian_sigma/layer_0/mlp": 11239.4072265625, "eoc/jacobian_sigma/layer_0": 11239.4072265625, "eoc/jacobian_sigma/layer_7/attn": 1.102490782737732, "eoc/jacobian_sigma/layer_7/mlp": 1.8086272478103638, "eoc/jacobian_sigma/layer_7": 1.8086272478103638, "eoc/jacobian_sigma/layer_14/attn": 1.925520420074463, "eoc/jacobian_sigma/layer_14/mlp": 16.64449119567871, "eoc/jacobian_sigma/layer_14": 16.64449119567871, "eoc/jacobian_sigma/layer_21/attn": 1.0620713233947754, "eoc/jacobian_sigma/layer_21/mlp": 4.543246269226074, "eoc/jacobian_sigma/layer_21": 4.543246269226074, "eoc/jacobian_sigma/layer_27/attn": 3.358844041824341, "eoc/jacobian_sigma/layer_27/mlp": 31.263776779174805, "eoc/jacobian_sigma/layer_27": 31.263776779174805, "eoc/layer0_sigma": 11239.4072265625, "eoc/sigma_max": 31.263776779174805, "eoc/sigma_min": 1.8086272478103638, "eoc/sigma_mean": 13.565035372972488, "eoc/time_s": 0.7460763454437256} {"step": 4010, "timestamp": 1778330054.6458855, "train/loss": 2.379233169555664, "train/z_loss": 0.0013587554800324141, "train/perplexity": 10.796620511452973, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163984.064941896, "perf/iters_per_sec": 0.5550308537206153, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8017016410827638, "data/tokens_consumed": 8411676672, "data/tokens_consumed_B": 8.411676672, "train/loss_slope": -7.728733788944327e-06} {"step": 4020, "timestamp": 1778330065.0002885, "train/loss": 2.346146297454834, "train/z_loss": 0.0013506161398254336, "train/perplexity": 10.445239218790993, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026669.9251119283, "perf/iters_per_sec": 0.966391527706112, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034777283668518, "data/tokens_consumed": 8432648192, "data/tokens_consumed_B": 8.432648192, "train/loss_slope": -1.2213315714811165e-05} {"step": 4030, "timestamp": 1778330075.3425128, "train/loss": 2.368977403640747, "train/z_loss": 0.0013578221783973276, "train/perplexity": 10.686458760951078, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028563.4726573015, "perf/iters_per_sec": 0.9672944415365703, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033811378479004, "data/tokens_consumed": 8453619712, "data/tokens_consumed_B": 8.453619712, "train/loss_slope": -1.3027102905030728e-05} {"step": 4040, "timestamp": 1778330085.705166, "train/loss": 2.376288890838623, "train/z_loss": 0.0013539856998249888, "train/perplexity": 10.764879002320031, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025620.704141236, "perf/iters_per_sec": 0.96589122016012, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353132724761962, "data/tokens_consumed": 8474591232, "data/tokens_consumed_B": 8.474591232, "train/loss_slope": -1.0825209527483431e-05} {"step": 4050, "timestamp": 1778330096.0387788, "grad/layer_0/attn": 0.004433070309460163, "grad/layer_0/mlp": 0.0038580649998039007, "grad/layer_0/attn_mlp_ratio": 1.1490397893196487, "grad/layer_4/attn": 0.002451722975820303, "grad/layer_4/mlp": 0.002614883240312338, "grad/layer_4/attn_mlp_ratio": 0.9376031955320248, "grad/layer_8/attn": 0.00452062301337719, "grad/layer_8/mlp": 0.003594090696424246, "grad/layer_8/attn_mlp_ratio": 1.2577932137593042, "grad/layer_12/attn": 0.006363104097545147, "grad/layer_12/mlp": 0.0069479383528232574, "grad/layer_12/attn_mlp_ratio": 0.9158262038086324, "grad/layer_16/attn": 0.003689239267259836, "grad/layer_16/mlp": 0.004637368954718113, "grad/layer_16/attn_mlp_ratio": 0.7955457553041551, "grad/layer_20/attn": 0.0030051860958337784, "grad/layer_20/mlp": 0.006650195457041264, "grad/layer_20/attn_mlp_ratio": 0.4518943946921788, "grad/layer_24/attn": 0.014501291327178478, "grad/layer_24/mlp": 0.011878438293933868, "grad/layer_24/attn_mlp_ratio": 1.2208078912615365, "grad/layer_27/attn": 0.0047164312563836575, "grad/layer_27/mlp": 0.012798425741493702, "grad/layer_27/attn_mlp_ratio": 0.3685165124833199} {"step": 4050, "timestamp": 1778330096.6560848, "eos/sharpness": 64.47739601135252, "eos/L0_probe": 2.36336088180542, "eos/L_plus": 2.7451975345611572, "eos/L_minus": 2.626298189163208, "eos/grad_norm": 0.21392668783664703, "eos/embed_grad_frac": 0.06456287205219269, "eos/time_s": 0.6144108772277832} {"step": 4050, "timestamp": 1778330096.6753235, "train/loss": 2.3579341888427736, "train/z_loss": 0.0013533588848076762, "train/perplexity": 10.569095130047812, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912536.9783044152, "perf/iters_per_sec": 0.9119686976930691, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0965288639068604, "data/tokens_consumed": 8495562752, "data/tokens_consumed_B": 8.495562752, "train/loss_slope": -9.555730410534863e-06} {"step": 4050, "timestamp": 1778330098.036575, "geo/rankme_last": 426.4488220214844, "geo/layer_0/stable_rank_q_proj": 20.539655685424805, "geo/layer_0/stable_rank_k_proj": 17.1903133392334, "geo/layer_0/stable_rank_o_proj": 45.75897979736328, "geo/layer_0/stable_rank_gate_proj": 130.38894653320312, "geo/layer_0/stable_rank_down_proj": 56.39560317993164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06396596133708954, "geo/layer_0/attn_entropy_mean": 6.258613109588623, "geo/layer_0/attn_entropy_std": 0.42288053035736084, "geo/layer_7/stable_rank_q_proj": 42.5463981628418, "geo/layer_7/stable_rank_k_proj": 39.68958282470703, "geo/layer_7/stable_rank_o_proj": 89.84841918945312, "geo/layer_7/stable_rank_gate_proj": 79.6084213256836, "geo/layer_7/stable_rank_down_proj": 142.8238067626953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4137195646762848, "geo/layer_7/attn_entropy_mean": 4.710995197296143, "geo/layer_7/attn_entropy_std": 0.7500614523887634, "geo/layer_14/stable_rank_q_proj": 51.70132064819336, "geo/layer_14/stable_rank_k_proj": 41.69554138183594, "geo/layer_14/stable_rank_o_proj": 42.6873779296875, "geo/layer_14/stable_rank_gate_proj": 71.70586395263672, "geo/layer_14/stable_rank_down_proj": 127.50828552246094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37499168515205383, "geo/layer_14/attn_entropy_mean": 5.516992568969727, "geo/layer_14/attn_entropy_std": 0.44049257040023804, "geo/layer_21/stable_rank_q_proj": 39.64788818359375, "geo/layer_21/stable_rank_k_proj": 29.23457908630371, "geo/layer_21/stable_rank_o_proj": 66.57228088378906, "geo/layer_21/stable_rank_gate_proj": 62.82258605957031, "geo/layer_21/stable_rank_down_proj": 49.74128341674805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13797755539417267, "geo/layer_21/attn_entropy_mean": 5.8658342361450195, "geo/layer_21/attn_entropy_std": 0.3147546947002411, "geo/layer_27/stable_rank_q_proj": 43.796043395996094, "geo/layer_27/stable_rank_k_proj": 30.817554473876953, "geo/layer_27/stable_rank_o_proj": 110.40531158447266, "geo/layer_27/stable_rank_gate_proj": 73.90018463134766, "geo/layer_27/stable_rank_down_proj": 127.20219421386719, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10647791624069214, "geo/layer_27/attn_entropy_mean": 4.336068153381348, "geo/layer_27/attn_entropy_std": 0.6530513763427734, "attnres/final_alpha/block_0": 0.25024867057800293, "attnres/block_norm/0": 1.7783114910125732, "attnres/final_alpha/block_1": 0.00407577445730567, "attnres/block_norm/1": 49572.6484375, "attnres/final_alpha/block_2": 0.008712511509656906, "attnres/block_norm/2": 29696.8203125, "attnres/final_alpha/block_3": 0.010520128533244133, "attnres/block_norm/3": 67904.046875, "attnres/final_alpha/block_4": 0.012065084651112556, "attnres/block_norm/4": 16706.21875, "attnres/final_alpha/block_5": 0.611956775188446, "attnres/block_norm/5": 6994.06640625, "attnres/final_alpha/block_6": 0.10242107510566711, "attnres/block_norm/6": 45749.734375, "geo/tier1_time_s": 1.3569090366363525, "geo/step": 4050.0, "geo/rankme_slope": 0.0017601668792517006} {"step": 4060, "timestamp": 1778330108.3916955, "train/loss": 2.383272624015808, "train/z_loss": 0.0013426298624835908, "train/perplexity": 10.840321172324524, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790592.5433129598, "perf/iters_per_sec": 0.8538210598530578, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1712055921554565, "data/tokens_consumed": 8516534272, "data/tokens_consumed_B": 8.516534272, "train/loss_slope": -7.462045690729267e-06} {"step": 4070, "timestamp": 1778330118.7350657, "train/loss": 2.3737098932266236, "train/z_loss": 0.0013404284603893758, "train/perplexity": 10.737152174159514, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028811.7332460927, "perf/iters_per_sec": 0.9674128214102233, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336848735809325, "data/tokens_consumed": 8537505792, "data/tokens_consumed_B": 8.537505792, "train/loss_slope": -6.237390353949238e-06} {"step": 4080, "timestamp": 1778330129.0792136, "train/loss": 2.339839053153992, "train/z_loss": 0.0013408696395345032, "train/perplexity": 10.379565869899386, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028458.2633117146, "perf/iters_per_sec": 0.9672442738112043, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338649988174438, "data/tokens_consumed": 8558477312, "data/tokens_consumed_B": 8.558477312, "train/loss_slope": -8.30023471135249e-06} {"step": 4090, "timestamp": 1778330139.4342499, "train/loss": 2.3981493949890136, "train/z_loss": 0.0013542172149755062, "train/perplexity": 11.002795699306644, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026663.200963627, "perf/iters_per_sec": 0.9663883213823448, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347807168960572, "data/tokens_consumed": 8579448832, "data/tokens_consumed_B": 8.579448832, "train/loss_slope": -6.355300208594365e-06} {"step": 4100, "timestamp": 1778330149.7684774, "grad/layer_0/attn": 0.004082163330167532, "grad/layer_0/mlp": 0.004156915470957756, "grad/layer_0/attn_mlp_ratio": 0.9820173781463162, "grad/layer_4/attn": 0.0020104702562093735, "grad/layer_4/mlp": 0.0027968452777713537, "grad/layer_4/attn_mlp_ratio": 0.7188349674916252, "grad/layer_8/attn": 0.004089986905455589, "grad/layer_8/mlp": 0.0036352514289319515, "grad/layer_8/attn_mlp_ratio": 1.1250904849099232, "grad/layer_12/attn": 0.0056145079433918, "grad/layer_12/mlp": 0.00750346202403307, "grad/layer_12/attn_mlp_ratio": 0.7482556519355139, "grad/layer_16/attn": 0.0055102016776800156, "grad/layer_16/mlp": 0.005573184695094824, "grad/layer_16/attn_mlp_ratio": 0.988698900228422, "grad/layer_20/attn": 0.0033162387553602457, "grad/layer_20/mlp": 0.006350281648337841, "grad/layer_20/attn_mlp_ratio": 0.5222191529105393, "grad/layer_24/attn": 0.011778343468904495, "grad/layer_24/mlp": 0.009673678316175938, "grad/layer_24/attn_mlp_ratio": 1.2175661586195816, "grad/layer_27/attn": 0.009420814923942089, "grad/layer_27/mlp": 0.009802151471376419, "grad/layer_27/attn_mlp_ratio": 0.9610966383596962} {"step": 4100, "timestamp": 1778330149.783982, "train/loss": 2.3938229084014893, "train/z_loss": 0.0013462396804243327, "train/perplexity": 10.955295080777756, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027500.1865225125, "perf/iters_per_sec": 0.9667874271977007, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034353542327881, "data/tokens_consumed": 8600420352, "data/tokens_consumed_B": 8.600420352, "train/loss_slope": -5.848940020382865e-06} {"step": 4110, "timestamp": 1778330160.1277297, "train/loss": 2.37853581905365, "train/z_loss": 0.001342770119663328, "train/perplexity": 10.789094107295123, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028633.6494751854, "perf/iters_per_sec": 0.9673279044509818, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337756156921387, "data/tokens_consumed": 8621391872, "data/tokens_consumed_B": 8.621391872, "train/loss_slope": -6.780006904842289e-06} {"step": 4120, "timestamp": 1778330170.4817197, "train/loss": 2.3867655992507935, "train/z_loss": 0.0013328063767403364, "train/perplexity": 10.878252353490835, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026673.5673775547, "perf/iters_per_sec": 0.9663932644737028, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347754240036011, "data/tokens_consumed": 8642363392, "data/tokens_consumed_B": 8.642363392, "train/loss_slope": -4.392040709350367e-06} {"step": 4125, "timestamp": 1778330176.241544, "eos/sharpness": 62.19899654388426, "eos/L0_probe": 2.363828420639038, "eos/L_plus": 2.6665332317352295, "eos/L_minus": 2.6831135749816895, "eos/grad_norm": 0.23075748980045319, "eos/embed_grad_frac": 0.05201883986592293, "eos/time_s": 0.5894591808319092} {"step": 4125, "timestamp": 1778330177.6192534, "geo/rankme_last": 426.3441162109375, "geo/layer_0/stable_rank_q_proj": 20.526790618896484, "geo/layer_0/stable_rank_k_proj": 17.15032958984375, "geo/layer_0/stable_rank_o_proj": 45.72576904296875, "geo/layer_0/stable_rank_gate_proj": 130.51649475097656, "geo/layer_0/stable_rank_down_proj": 56.43370819091797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06396590918302536, "geo/layer_0/attn_entropy_mean": 6.255759239196777, "geo/layer_0/attn_entropy_std": 0.42723456025123596, "geo/layer_7/stable_rank_q_proj": 42.56883239746094, "geo/layer_7/stable_rank_k_proj": 39.68632125854492, "geo/layer_7/stable_rank_o_proj": 89.71654510498047, "geo/layer_7/stable_rank_gate_proj": 79.46114349365234, "geo/layer_7/stable_rank_down_proj": 142.54981994628906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4086112976074219, "geo/layer_7/attn_entropy_mean": 4.721270561218262, "geo/layer_7/attn_entropy_std": 0.7592201828956604, "geo/layer_14/stable_rank_q_proj": 51.730552673339844, "geo/layer_14/stable_rank_k_proj": 41.57735061645508, "geo/layer_14/stable_rank_o_proj": 42.68250274658203, "geo/layer_14/stable_rank_gate_proj": 71.79069519042969, "geo/layer_14/stable_rank_down_proj": 127.44042205810547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.35535410046577454, "geo/layer_14/attn_entropy_mean": 5.489806175231934, "geo/layer_14/attn_entropy_std": 0.4310460090637207, "geo/layer_21/stable_rank_q_proj": 39.63478088378906, "geo/layer_21/stable_rank_k_proj": 29.200939178466797, "geo/layer_21/stable_rank_o_proj": 66.47574615478516, "geo/layer_21/stable_rank_gate_proj": 62.72423553466797, "geo/layer_21/stable_rank_down_proj": 49.75678253173828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13342350721359253, "geo/layer_21/attn_entropy_mean": 5.855103015899658, "geo/layer_21/attn_entropy_std": 0.3171810507774353, "geo/layer_27/stable_rank_q_proj": 43.778629302978516, "geo/layer_27/stable_rank_k_proj": 30.819927215576172, "geo/layer_27/stable_rank_o_proj": 110.43700408935547, "geo/layer_27/stable_rank_gate_proj": 73.85052490234375, "geo/layer_27/stable_rank_down_proj": 127.42113494873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10193513333797455, "geo/layer_27/attn_entropy_mean": 4.295153617858887, "geo/layer_27/attn_entropy_std": 0.6351082921028137, "attnres/final_alpha/block_0": 0.24984143674373627, "attnres/block_norm/0": 1.778308629989624, "attnres/final_alpha/block_1": 0.004010479897260666, "attnres/block_norm/1": 49521.640625, "attnres/final_alpha/block_2": 0.008743819780647755, "attnres/block_norm/2": 29561.576171875, "attnres/final_alpha/block_3": 0.010632000863552094, "attnres/block_norm/3": 68521.0390625, "attnres/final_alpha/block_4": 0.012129424139857292, "attnres/block_norm/4": 16729.79296875, "attnres/final_alpha/block_5": 0.6122792959213257, "attnres/block_norm/5": 6999.49609375, "attnres/final_alpha/block_6": 0.10236351191997528, "attnres/block_norm/6": 45829.25390625, "geo/tier1_time_s": 1.3586249351501465, "geo/step": 4125.0, "geo/rankme_slope": 0.0017014011463960584} {"step": 4130, "timestamp": 1778330182.799948, "train/loss": 2.389129972457886, "train/z_loss": 0.0013475490035489202, "train/perplexity": 10.904003032006509, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703429.6151833234, "perf/iters_per_sec": 0.8122585369030587, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.231135106086731, "data/tokens_consumed": 8663334912, "data/tokens_consumed_B": 8.663334912, "train/loss_slope": -3.072466680032761e-06} {"step": 4140, "timestamp": 1778330193.1414895, "train/loss": 2.3241273164749146, "train/z_loss": 0.0013509092037566005, "train/perplexity": 10.217759323631048, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028813.7454063573, "perf/iters_per_sec": 0.9674137808830058, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336838483810424, "data/tokens_consumed": 8684306432, "data/tokens_consumed_B": 8.684306432, "train/loss_slope": -7.154650921367976e-06} {"step": 4150, "timestamp": 1778330203.4820464, "grad/layer_0/attn": 0.003048927988857031, "grad/layer_0/mlp": 0.003285312093794346, "grad/layer_0/attn_mlp_ratio": 0.9280481759438804, "grad/layer_4/attn": 0.003034563735127449, "grad/layer_4/mlp": 0.002657547825947404, "grad/layer_4/attn_mlp_ratio": 1.141866043317221, "grad/layer_8/attn": 0.003869675798341632, "grad/layer_8/mlp": 0.003586405888199806, "grad/layer_8/attn_mlp_ratio": 1.0789843121704168, "grad/layer_12/attn": 0.005786340218037367, "grad/layer_12/mlp": 0.007234480697661638, "grad/layer_12/attn_mlp_ratio": 0.7998279876432396, "grad/layer_16/attn": 0.0039157625287771225, "grad/layer_16/mlp": 0.005266525316983461, "grad/layer_16/attn_mlp_ratio": 0.7435191551814397, "grad/layer_20/attn": 0.0035046518314629793, "grad/layer_20/mlp": 0.006608567666262388, "grad/layer_20/attn_mlp_ratio": 0.5303194210029426, "grad/layer_24/attn": 0.007069945335388184, "grad/layer_24/mlp": 0.009139114990830421, "grad/layer_24/attn_mlp_ratio": 0.7735918921167428, "grad/layer_27/attn": 0.005766876973211765, "grad/layer_27/mlp": 0.00870892871171236, "grad/layer_27/attn_mlp_ratio": 0.6621798269215471} {"step": 4150, "timestamp": 1778330203.4976058, "train/loss": 2.3807136535644533, "train/z_loss": 0.0013540981919504703, "train/perplexity": 10.812616573504457, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.6520237345, "perf/iters_per_sec": 0.9662307033651993, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349495172500611, "data/tokens_consumed": 8705277952, "data/tokens_consumed_B": 8.705277952, "train/loss_slope": -7.336055685226929e-06} {"step": 4160, "timestamp": 1778330213.866786, "train/loss": 2.375249814987183, "train/z_loss": 0.0013539225794374943, "train/perplexity": 10.753699285797651, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023665.5525905911, "perf/iters_per_sec": 0.9649589312508541, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036313533782959, "data/tokens_consumed": 8726249472, "data/tokens_consumed_B": 8.726249472, "train/loss_slope": -5.709202125771251e-06} {"step": 4170, "timestamp": 1778330224.2292793, "train/loss": 2.3326368808746336, "train/z_loss": 0.001350661856122315, "train/perplexity": 10.30507900391728, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025168.093323253, "perf/iters_per_sec": 0.965675398503901, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355446577072143, "data/tokens_consumed": 8747220992, "data/tokens_consumed_B": 8.747220992, "train/loss_slope": -8.252850417220898e-06} {"step": 4180, "timestamp": 1778330234.591348, "train/loss": 2.401390290260315, "train/z_loss": 0.0013548880815505981, "train/perplexity": 11.038512453727597, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025365.1092338588, "perf/iters_per_sec": 0.9657693430108351, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354439258575439, "data/tokens_consumed": 8768192512, "data/tokens_consumed_B": 8.768192512, "train/loss_slope": -8.004761629193226e-06} {"step": 4190, "timestamp": 1778330244.948756, "train/loss": 2.4040842294692992, "train/z_loss": 0.001352788833901286, "train/perplexity": 11.068289626173238, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026145.4821586716, "perf/iters_per_sec": 0.9661414538186415, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350451231002809, "data/tokens_consumed": 8789164032, "data/tokens_consumed_B": 8.789164032, "train/loss_slope": -6.346420050024949e-06} {"step": 4200, "timestamp": 1778330255.2785103, "grad/layer_0/attn": 0.0032908187713474035, "grad/layer_0/mlp": 0.003521491540595889, "grad/layer_0/attn_mlp_ratio": 0.9344956930781037, "grad/layer_4/attn": 0.0024584562052041292, "grad/layer_4/mlp": 0.0027427419554442167, "grad/layer_4/attn_mlp_ratio": 0.8963497680447955, "grad/layer_8/attn": 0.003438921645283699, "grad/layer_8/mlp": 0.003642416326329112, "grad/layer_8/attn_mlp_ratio": 0.9441319285805828, "grad/layer_12/attn": 0.011912234127521515, "grad/layer_12/mlp": 0.007476347498595715, "grad/layer_12/attn_mlp_ratio": 1.5933226713213542, "grad/layer_16/attn": 0.004315777216106653, "grad/layer_16/mlp": 0.005041488446295261, "grad/layer_16/attn_mlp_ratio": 0.856052171194181, "grad/layer_20/attn": 0.003300366224721074, "grad/layer_20/mlp": 0.0066069639287889, "grad/layer_20/attn_mlp_ratio": 0.49952840825834693, "grad/layer_24/attn": 0.010760246776044369, "grad/layer_24/mlp": 0.009686688892543316, "grad/layer_24/attn_mlp_ratio": 1.1108281461629939, "grad/layer_27/attn": 0.005525462795048952, "grad/layer_27/mlp": 0.007808270864188671, "grad/layer_27/attn_mlp_ratio": 0.7076422962766742} {"step": 4200, "timestamp": 1778330255.874955, "eos/sharpness": 41.35968685150146, "eos/L0_probe": 2.36142897605896, "eos/L_plus": 2.6082611083984375, "eos/L_minus": 2.528193712234497, "eos/grad_norm": 0.14324134588241577, "eos/embed_grad_frac": 0.1153697818517685, "eos/time_s": 0.5934350490570068} {"step": 4200, "timestamp": 1778330255.8953145, "train/loss": 2.380304980278015, "train/z_loss": 0.0013631909852847457, "train/perplexity": 10.808198648759765, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916553.3775841277, "perf/iters_per_sec": 0.9138838661118163, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0942309379577637, "data/tokens_consumed": 8810135552, "data/tokens_consumed_B": 8.810135552, "train/loss_slope": -7.95335224573935e-06} {"step": 4200, "timestamp": 1778330257.2581105, "geo/rankme_last": 426.4512634277344, "geo/layer_0/stable_rank_q_proj": 20.56540870666504, "geo/layer_0/stable_rank_k_proj": 17.192537307739258, "geo/layer_0/stable_rank_o_proj": 45.69661331176758, "geo/layer_0/stable_rank_gate_proj": 130.42105102539062, "geo/layer_0/stable_rank_down_proj": 56.460853576660156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06736665964126587, "geo/layer_0/attn_entropy_mean": 6.2609148025512695, "geo/layer_0/attn_entropy_std": 0.42777174711227417, "geo/layer_7/stable_rank_q_proj": 42.6104850769043, "geo/layer_7/stable_rank_k_proj": 39.62532424926758, "geo/layer_7/stable_rank_o_proj": 89.72640991210938, "geo/layer_7/stable_rank_gate_proj": 79.35535430908203, "geo/layer_7/stable_rank_down_proj": 142.6981201171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41353198885917664, "geo/layer_7/attn_entropy_mean": 4.741043567657471, "geo/layer_7/attn_entropy_std": 0.753684401512146, "geo/layer_14/stable_rank_q_proj": 51.63501739501953, "geo/layer_14/stable_rank_k_proj": 41.643619537353516, "geo/layer_14/stable_rank_o_proj": 42.67670822143555, "geo/layer_14/stable_rank_gate_proj": 71.8656005859375, "geo/layer_14/stable_rank_down_proj": 127.20924377441406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37639933824539185, "geo/layer_14/attn_entropy_mean": 5.497611045837402, "geo/layer_14/attn_entropy_std": 0.44180598855018616, "geo/layer_21/stable_rank_q_proj": 39.64581298828125, "geo/layer_21/stable_rank_k_proj": 29.126399993896484, "geo/layer_21/stable_rank_o_proj": 66.40180206298828, "geo/layer_21/stable_rank_gate_proj": 62.73038101196289, "geo/layer_21/stable_rank_down_proj": 49.773197174072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1385742872953415, "geo/layer_21/attn_entropy_mean": 5.886289596557617, "geo/layer_21/attn_entropy_std": 0.31821170449256897, "geo/layer_27/stable_rank_q_proj": 43.78616714477539, "geo/layer_27/stable_rank_k_proj": 30.734493255615234, "geo/layer_27/stable_rank_o_proj": 110.44625091552734, "geo/layer_27/stable_rank_gate_proj": 73.75350952148438, "geo/layer_27/stable_rank_down_proj": 127.13020324707031, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1026330292224884, "geo/layer_27/attn_entropy_mean": 4.309353828430176, "geo/layer_27/attn_entropy_std": 0.6523962616920471, "attnres/final_alpha/block_0": 0.24941259622573853, "attnres/block_norm/0": 1.7782213687896729, "attnres/final_alpha/block_1": 0.0040006451308727264, "attnres/block_norm/1": 49464.8984375, "attnres/final_alpha/block_2": 0.008557471446692944, "attnres/block_norm/2": 29721.810546875, "attnres/final_alpha/block_3": 0.010454827919602394, "attnres/block_norm/3": 68692.2109375, "attnres/final_alpha/block_4": 0.012006303295493126, "attnres/block_norm/4": 16703.783203125, "attnres/final_alpha/block_5": 0.6156615018844604, "attnres/block_norm/5": 6935.03515625, "attnres/final_alpha/block_6": 0.09990662336349487, "attnres/block_norm/6": 46003.1953125, "geo/tier1_time_s": 1.35898756980896, "geo/step": 4200.0, "geo/rankme_slope": 0.001689419048869548} {"step": 4210, "timestamp": 1778330267.9806352, "train/loss": 2.3869575023651124, "train/z_loss": 0.0013404564233496785, "train/perplexity": 10.880340124314271, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1735848.0832072624, "perf/iters_per_sec": 0.8277168670688927, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.208142590522766, "data/tokens_consumed": 8831107072, "data/tokens_consumed_B": 8.831107072, "train/loss_slope": -1.0010191101660152e-05} {"step": 4220, "timestamp": 1778330278.3277726, "train/loss": 2.3732823848724367, "train/z_loss": 0.0013341674581170081, "train/perplexity": 10.732562932944258, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028312.4197628868, "perf/iters_per_sec": 0.967174730187839, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339393377304078, "data/tokens_consumed": 8852078592, "data/tokens_consumed_B": 8.852078592, "train/loss_slope": -1.2367339324493287e-05} {"step": 4230, "timestamp": 1778330288.6708775, "train/loss": 2.3593430042266847, "train/z_loss": 0.0013440509093925357, "train/perplexity": 10.58399552735104, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028552.7594236361, "perf/iters_per_sec": 0.9672893330686742, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338168382644652, "data/tokens_consumed": 8873050112, "data/tokens_consumed_B": 8.873050112, "train/loss_slope": -1.3431282197967646e-05} {"step": 4240, "timestamp": 1778330299.0183866, "train/loss": 2.3792418956756594, "train/z_loss": 0.0013432903098873795, "train/perplexity": 10.796714724470158, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028058.6701779962, "perf/iters_per_sec": 0.9670537329568845, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340687036514282, "data/tokens_consumed": 8894021632, "data/tokens_consumed_B": 8.894021632, "train/loss_slope": -1.4007743743314987e-05} {"step": 4250, "timestamp": 1778330309.361172, "grad/layer_0/attn": 0.0033658258616924286, "grad/layer_0/mlp": 0.0033607978839427233, "grad/layer_0/attn_mlp_ratio": 1.0014960368858015, "grad/layer_4/attn": 0.0018173751886934042, "grad/layer_4/mlp": 0.00256336759775877, "grad/layer_4/attn_mlp_ratio": 0.7089795156123682, "grad/layer_8/attn": 0.006381358485668898, "grad/layer_8/mlp": 0.0034768003970384598, "grad/layer_8/attn_mlp_ratio": 1.8354111750457185, "grad/layer_12/attn": 0.007294607814401388, "grad/layer_12/mlp": 0.00637515215203166, "grad/layer_12/attn_mlp_ratio": 1.1442248790335499, "grad/layer_16/attn": 0.003524031722918153, "grad/layer_16/mlp": 0.004068722482770681, "grad/layer_16/attn_mlp_ratio": 0.8661273043880987, "grad/layer_20/attn": 0.002881933469325304, "grad/layer_20/mlp": 0.005723938811570406, "grad/layer_20/attn_mlp_ratio": 0.503487810378225, "grad/layer_24/attn": 0.012551870197057724, "grad/layer_24/mlp": 0.01081137266010046, "grad/layer_24/attn_mlp_ratio": 1.1609876447309815, "grad/layer_27/attn": 0.005417989566922188, "grad/layer_27/mlp": 0.011017614044249058, "grad/layer_27/attn_mlp_ratio": 0.491757062462589} {"step": 4250, "timestamp": 1778330309.3769114, "train/loss": 2.409302020072937, "train/z_loss": 0.00135876884451136, "train/perplexity": 11.12619257514567, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025933.9891556557, "perf/iters_per_sec": 0.9660406060961035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351531744003295, "data/tokens_consumed": 8914993152, "data/tokens_consumed_B": 8.914993152, "train/loss_slope": -1.1903270414703128e-05} {"step": 4260, "timestamp": 1778330319.724049, "train/loss": 2.37669723033905, "train/z_loss": 0.0013530591153539718, "train/perplexity": 10.769275625230305, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028299.2303285361, "perf/iters_per_sec": 0.9671684409754449, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339460611343383, "data/tokens_consumed": 8935964672, "data/tokens_consumed_B": 8.935964672, "train/loss_slope": -1.1757771059660508e-05} {"step": 4270, "timestamp": 1778330330.0817025, "train/loss": 2.335754418373108, "train/z_loss": 0.0013511678320355714, "train/perplexity": 10.337255603954858, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026082.4309710006, "perf/iters_per_sec": 0.9661113886694911, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350773334503174, "data/tokens_consumed": 8956936192, "data/tokens_consumed_B": 8.956936192, "train/loss_slope": -1.3774520199898938e-05} {"step": 4275, "timestamp": 1778330335.8388674, "eos/sharpness": 61.31365299224852, "eos/L0_probe": 2.3547189235687256, "eos/L_plus": 2.7130961418151855, "eos/L_minus": 2.609478235244751, "eos/grad_norm": 0.17376911640167236, "eos/embed_grad_frac": 0.08329714089632034, "eos/time_s": 0.5922691822052002} {"step": 4275, "timestamp": 1778330337.2185237, "geo/rankme_last": 427.4007263183594, "geo/layer_0/stable_rank_q_proj": 20.596561431884766, "geo/layer_0/stable_rank_k_proj": 17.20778465270996, "geo/layer_0/stable_rank_o_proj": 45.6678466796875, "geo/layer_0/stable_rank_gate_proj": 130.12376403808594, "geo/layer_0/stable_rank_down_proj": 56.41114044189453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06478270888328552, "geo/layer_0/attn_entropy_mean": 6.252593040466309, "geo/layer_0/attn_entropy_std": 0.43350332975387573, "geo/layer_7/stable_rank_q_proj": 42.61267852783203, "geo/layer_7/stable_rank_k_proj": 39.51814270019531, "geo/layer_7/stable_rank_o_proj": 89.73625946044922, "geo/layer_7/stable_rank_gate_proj": 79.36495208740234, "geo/layer_7/stable_rank_down_proj": 142.91893005371094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4166329503059387, "geo/layer_7/attn_entropy_mean": 4.7059807777404785, "geo/layer_7/attn_entropy_std": 0.7583251595497131, "geo/layer_14/stable_rank_q_proj": 51.593406677246094, "geo/layer_14/stable_rank_k_proj": 41.60734939575195, "geo/layer_14/stable_rank_o_proj": 42.6973876953125, "geo/layer_14/stable_rank_gate_proj": 71.87942504882812, "geo/layer_14/stable_rank_down_proj": 127.3049087524414, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36480873823165894, "geo/layer_14/attn_entropy_mean": 5.529492378234863, "geo/layer_14/attn_entropy_std": 0.4087960124015808, "geo/layer_21/stable_rank_q_proj": 39.5394401550293, "geo/layer_21/stable_rank_k_proj": 29.130043029785156, "geo/layer_21/stable_rank_o_proj": 66.4763412475586, "geo/layer_21/stable_rank_gate_proj": 62.60581588745117, "geo/layer_21/stable_rank_down_proj": 49.80756378173828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13527417182922363, "geo/layer_21/attn_entropy_mean": 5.860034465789795, "geo/layer_21/attn_entropy_std": 0.3147179186344147, "geo/layer_27/stable_rank_q_proj": 43.74955368041992, "geo/layer_27/stable_rank_k_proj": 30.69895362854004, "geo/layer_27/stable_rank_o_proj": 110.37948608398438, "geo/layer_27/stable_rank_gate_proj": 73.73993682861328, "geo/layer_27/stable_rank_down_proj": 127.04930877685547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1152728945016861, "geo/layer_27/attn_entropy_mean": 4.303488731384277, "geo/layer_27/attn_entropy_std": 0.6665770411491394, "attnres/final_alpha/block_0": 0.24941712617874146, "attnres/block_norm/0": 1.7781251668930054, "attnres/final_alpha/block_1": 0.004019567742943764, "attnres/block_norm/1": 49665.69921875, "attnres/final_alpha/block_2": 0.008686569519340992, "attnres/block_norm/2": 29585.48828125, "attnres/final_alpha/block_3": 0.010399636812508106, "attnres/block_norm/3": 68655.6015625, "attnres/final_alpha/block_4": 0.011808233335614204, "attnres/block_norm/4": 16639.4375, "attnres/final_alpha/block_5": 0.6143331527709961, "attnres/block_norm/5": 6982.6708984375, "attnres/final_alpha/block_6": 0.10133575648069382, "attnres/block_norm/6": 45616.65625, "geo/tier1_time_s": 1.360344409942627, "geo/step": 4275.0, "geo/rankme_slope": 0.001639650743109744} {"step": 4280, "timestamp": 1778330342.394056, "train/loss": 2.3564535856246946, "train/z_loss": 0.0013404550263658166, "train/perplexity": 10.553458072781346, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704209.945402891, "perf/iters_per_sec": 0.8126306273474173, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2305713891983032, "data/tokens_consumed": 8977907712, "data/tokens_consumed_B": 8.977907712, "train/loss_slope": -1.6672030791412215e-05} {"step": 4290, "timestamp": 1778330352.740663, "train/loss": 2.3911874532699584, "train/z_loss": 0.0013334196759387852, "train/perplexity": 10.926460904417986, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028254.3781810468, "perf/iters_per_sec": 0.9671470538048967, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339689254760742, "data/tokens_consumed": 8998879232, "data/tokens_consumed_B": 8.998879232, "train/loss_slope": -1.3539095451407576e-05} {"step": 4300, "timestamp": 1778330363.0751464, "grad/layer_0/attn": 0.0036530415527522564, "grad/layer_0/mlp": 0.0037217321805655956, "grad/layer_0/attn_mlp_ratio": 0.9815433452395184, "grad/layer_4/attn": 0.0018117023864760995, "grad/layer_4/mlp": 0.0027050271164625883, "grad/layer_4/attn_mlp_ratio": 0.6697538477432751, "grad/layer_8/attn": 0.004090829286724329, "grad/layer_8/mlp": 0.0036266001407057047, "grad/layer_8/attn_mlp_ratio": 1.128006677109879, "grad/layer_12/attn": 0.005843935534358025, "grad/layer_12/mlp": 0.007637779228389263, "grad/layer_12/attn_mlp_ratio": 0.7651354252454499, "grad/layer_16/attn": 0.005405709147453308, "grad/layer_16/mlp": 0.004545304458588362, "grad/layer_16/attn_mlp_ratio": 1.1892952557467706, "grad/layer_20/attn": 0.0029294458217918873, "grad/layer_20/mlp": 0.006087979767471552, "grad/layer_20/attn_mlp_ratio": 0.48118520191633607, "grad/layer_24/attn": 0.006138286553323269, "grad/layer_24/mlp": 0.007777696009725332, "grad/layer_24/attn_mlp_ratio": 0.7892165580560387, "grad/layer_27/attn": 0.004762269556522369, "grad/layer_27/mlp": 0.008284260518848896, "grad/layer_27/attn_mlp_ratio": 0.5748575250864199} {"step": 4300, "timestamp": 1778330363.0905986, "train/loss": 2.3870481491088866, "train/z_loss": 0.001355779601726681, "train/perplexity": 10.88132643642001, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027345.3222193453, "perf/iters_per_sec": 0.9667135821434714, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034432554244995, "data/tokens_consumed": 9019850752, "data/tokens_consumed_B": 9.019850752, "train/loss_slope": -1.3711992184249503e-05} {"step": 4310, "timestamp": 1778330373.4408174, "train/loss": 2.3730515956878664, "train/z_loss": 0.0013447779696434737, "train/perplexity": 10.730086259302352, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027120.638146315, "perf/iters_per_sec": 0.9666064444285941, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034547209739685, "data/tokens_consumed": 9040822272, "data/tokens_consumed_B": 9.040822272, "train/loss_slope": -1.3158203022565075e-05} {"step": 4320, "timestamp": 1778330383.787592, "train/loss": 2.393898868560791, "train/z_loss": 0.0013562221080064773, "train/perplexity": 10.956127278343818, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027789.6976112493, "perf/iters_per_sec": 0.9669254768425223, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342058658599853, "data/tokens_consumed": 9061793792, "data/tokens_consumed_B": 9.061793792, "train/loss_slope": -1.1648636582446349e-05} {"step": 4330, "timestamp": 1778330394.1310434, "train/loss": 2.3302030324935914, "train/z_loss": 0.00135485315695405, "train/perplexity": 10.280028500995872, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028744.2112054457, "perf/iters_per_sec": 0.9673806243922451, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033719277381897, "data/tokens_consumed": 9082765312, "data/tokens_consumed_B": 9.082765312, "train/loss_slope": -1.4892253852841905e-05} {"step": 4340, "timestamp": 1778330404.480208, "train/loss": 2.353497552871704, "train/z_loss": 0.0013479394256137311, "train/perplexity": 10.52230776840384, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027316.4454945282, "perf/iters_per_sec": 0.9666998126480714, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344472885131837, "data/tokens_consumed": 9103736832, "data/tokens_consumed_B": 9.103736832, "train/loss_slope": -1.5625548126673892e-05} {"step": 4350, "timestamp": 1778330415.2735403, "grad/layer_0/attn": 0.002904426073655486, "grad/layer_0/mlp": 0.0031431785319000483, "grad/layer_0/attn_mlp_ratio": 0.9240410469129983, "grad/layer_4/attn": 0.001713247736915946, "grad/layer_4/mlp": 0.0026233235839754343, "grad/layer_4/attn_mlp_ratio": 0.6530828610214269, "grad/layer_8/attn": 0.004292371217161417, "grad/layer_8/mlp": 0.0034141000360250473, "grad/layer_8/attn_mlp_ratio": 1.257248190194831, "grad/layer_12/attn": 0.006180855445563793, "grad/layer_12/mlp": 0.007113615516573191, "grad/layer_12/attn_mlp_ratio": 0.868876782035247, "grad/layer_16/attn": 0.003618153277784586, "grad/layer_16/mlp": 0.004866607021540403, "grad/layer_16/attn_mlp_ratio": 0.743465249489741, "grad/layer_20/attn": 0.003247962100431323, "grad/layer_20/mlp": 0.006307941861450672, "grad/layer_20/attn_mlp_ratio": 0.514900441424539, "grad/layer_24/attn": 0.006094356998801231, "grad/layer_24/mlp": 0.009080584160983562, "grad/layer_24/attn_mlp_ratio": 0.6711415062780467, "grad/layer_27/attn": 0.004120660480111837, "grad/layer_27/mlp": 0.00882491935044527, "grad/layer_27/attn_mlp_ratio": 0.4669346279306748} {"step": 4350, "timestamp": 1778330415.8741672, "eos/sharpness": 32.6484203338623, "eos/L0_probe": 2.3580753803253174, "eos/L_plus": 2.540952444076538, "eos/L_minus": 2.5016825199127197, "eos/grad_norm": 0.11384231597185135, "eos/embed_grad_frac": 0.1863042563199997, "eos/time_s": 0.5978200435638428} {"step": 4350, "timestamp": 1778330415.8954039, "train/loss": 2.360571789741516, "train/z_loss": 0.0013516686041839421, "train/perplexity": 10.59700898147862, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1838402.4974187186, "perf/iters_per_sec": 0.8766186225026696, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1407469272613526, "data/tokens_consumed": 9124708352, "data/tokens_consumed_B": 9.124708352, "train/loss_slope": -1.7283668498037217e-05} {"step": 4350, "timestamp": 1778330417.264397, "geo/rankme_last": 427.08251953125, "geo/layer_0/stable_rank_q_proj": 20.62217140197754, "geo/layer_0/stable_rank_k_proj": 17.198904037475586, "geo/layer_0/stable_rank_o_proj": 45.66370391845703, "geo/layer_0/stable_rank_gate_proj": 130.02139282226562, "geo/layer_0/stable_rank_down_proj": 56.49225616455078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06752505153417587, "geo/layer_0/attn_entropy_mean": 6.258755207061768, "geo/layer_0/attn_entropy_std": 0.4235442876815796, "geo/layer_7/stable_rank_q_proj": 42.567344665527344, "geo/layer_7/stable_rank_k_proj": 39.40324783325195, "geo/layer_7/stable_rank_o_proj": 89.7774887084961, "geo/layer_7/stable_rank_gate_proj": 79.37095642089844, "geo/layer_7/stable_rank_down_proj": 143.08059692382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4199843406677246, "geo/layer_7/attn_entropy_mean": 4.703330039978027, "geo/layer_7/attn_entropy_std": 0.7717511653900146, "geo/layer_14/stable_rank_q_proj": 51.64799880981445, "geo/layer_14/stable_rank_k_proj": 41.63627243041992, "geo/layer_14/stable_rank_o_proj": 42.701908111572266, "geo/layer_14/stable_rank_gate_proj": 71.93598937988281, "geo/layer_14/stable_rank_down_proj": 127.28962707519531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3828665316104889, "geo/layer_14/attn_entropy_mean": 5.524231910705566, "geo/layer_14/attn_entropy_std": 0.43370646238327026, "geo/layer_21/stable_rank_q_proj": 39.59270095825195, "geo/layer_21/stable_rank_k_proj": 29.09538459777832, "geo/layer_21/stable_rank_o_proj": 66.52333068847656, "geo/layer_21/stable_rank_gate_proj": 62.608497619628906, "geo/layer_21/stable_rank_down_proj": 49.814796447753906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1433832198381424, "geo/layer_21/attn_entropy_mean": 5.853401184082031, "geo/layer_21/attn_entropy_std": 0.3115076422691345, "geo/layer_27/stable_rank_q_proj": 43.69539260864258, "geo/layer_27/stable_rank_k_proj": 30.684505462646484, "geo/layer_27/stable_rank_o_proj": 110.17560577392578, "geo/layer_27/stable_rank_gate_proj": 73.6478271484375, "geo/layer_27/stable_rank_down_proj": 127.23016357421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10033217072486877, "geo/layer_27/attn_entropy_mean": 4.3218841552734375, "geo/layer_27/attn_entropy_std": 0.6490551233291626, "attnres/final_alpha/block_0": 0.24960370361804962, "attnres/block_norm/0": 1.77803635597229, "attnres/final_alpha/block_1": 0.004042860586196184, "attnres/block_norm/1": 49639.890625, "attnres/final_alpha/block_2": 0.008575222454965115, "attnres/block_norm/2": 29639.03125, "attnres/final_alpha/block_3": 0.010306360200047493, "attnres/block_norm/3": 69176.4140625, "attnres/final_alpha/block_4": 0.012243667617440224, "attnres/block_norm/4": 16654.53125, "attnres/final_alpha/block_5": 0.6140514612197876, "attnres/block_norm/5": 6976.3310546875, "attnres/final_alpha/block_6": 0.10117670893669128, "attnres/block_norm/6": 45644.1171875, "geo/tier1_time_s": 1.3646419048309326, "geo/step": 4350.0, "geo/rankme_slope": 0.001611699308629702} {"step": 4360, "timestamp": 1778330427.6100912, "train/loss": 2.375631856918335, "train/z_loss": 0.0013418958871625364, "train/perplexity": 10.75780843472344, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790829.4301274954, "perf/iters_per_sec": 0.8539340162885167, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1710506677627563, "data/tokens_consumed": 9145679872, "data/tokens_consumed_B": 9.145679872, "train/loss_slope": -1.7906679421833167e-05} {"step": 4370, "timestamp": 1778330437.9517655, "train/loss": 2.3779221534729005, "train/z_loss": 0.0013481211848556996, "train/perplexity": 10.782475242686418, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028872.0529931348, "perf/iters_per_sec": 0.9674415841069864, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336541414260865, "data/tokens_consumed": 9166651392, "data/tokens_consumed_B": 9.166651392, "train/loss_slope": -1.6673175501029397e-05} {"step": 4380, "timestamp": 1778330448.301506, "train/loss": 2.425672769546509, "train/z_loss": 0.0013343104626983404, "train/perplexity": 11.30983577341534, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027606.8388841825, "perf/iters_per_sec": 0.9668382830067551, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342991352081299, "data/tokens_consumed": 9187622912, "data/tokens_consumed_B": 9.187622912, "train/loss_slope": -1.2522728064737608e-05} {"step": 4390, "timestamp": 1778330458.650004, "train/loss": 2.339422011375427, "train/z_loss": 0.0013435695669613778, "train/perplexity": 10.375238059789812, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027803.3478983524, "perf/iters_per_sec": 0.9669319858066332, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341989040374755, "data/tokens_consumed": 9208594432, "data/tokens_consumed_B": 9.208594432, "train/loss_slope": -1.550812965894365e-05} {"step": 4400, "timestamp": 1778330468.9844193, "grad/layer_0/attn": 0.0030609352979809046, "grad/layer_0/mlp": 0.003380598733201623, "grad/layer_0/attn_mlp_ratio": 0.9054417424270426, "grad/layer_4/attn": 0.0018822751007974148, "grad/layer_4/mlp": 0.0026860577054321766, "grad/layer_4/attn_mlp_ratio": 0.7007574807179415, "grad/layer_8/attn": 0.004625958856195211, "grad/layer_8/mlp": 0.003549064276739955, "grad/layer_8/attn_mlp_ratio": 1.303430528483243, "grad/layer_12/attn": 0.0046502528712153435, "grad/layer_12/mlp": 0.0067605082876980305, "grad/layer_12/attn_mlp_ratio": 0.6878554991037835, "grad/layer_16/attn": 0.003335273126140237, "grad/layer_16/mlp": 0.00439217034727335, "grad/layer_16/attn_mlp_ratio": 0.7593678720302778, "grad/layer_20/attn": 0.005526610650122166, "grad/layer_20/mlp": 0.005226408131420612, "grad/layer_20/attn_mlp_ratio": 1.057439527378816, "grad/layer_24/attn": 0.007192519959062338, "grad/layer_24/mlp": 0.008114310912787914, "grad/layer_24/attn_mlp_ratio": 0.8863993440388395, "grad/layer_27/attn": 0.005334577057510614, "grad/layer_27/mlp": 0.007984345778822899, "grad/layer_27/attn_mlp_ratio": 0.6681295047174323} {"step": 4400, "timestamp": 1778330468.9999764, "train/loss": 2.3540632486343385, "train/z_loss": 0.0013354760245420038, "train/perplexity": 10.528261877269859, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027183.5670918298, "perf/iters_per_sec": 0.9666364512881421, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03451509475708, "data/tokens_consumed": 9229565952, "data/tokens_consumed_B": 9.229565952, "train/loss_slope": -1.619020315250974e-05} {"step": 4410, "timestamp": 1778330479.9214973, "train/loss": 2.3294710874557496, "train/z_loss": 0.00134318865602836, "train/perplexity": 10.272506838203382, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921182.8905944, "perf/iters_per_sec": 0.9160913899394989, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.09159414768219, "data/tokens_consumed": 9250537472, "data/tokens_consumed_B": 9.250537472, "train/loss_slope": -1.749184903460435e-05} {"step": 4420, "timestamp": 1778330490.2926753, "train/loss": 2.3875286102294924, "train/z_loss": 0.0013382290839217604, "train/perplexity": 10.88655574685291, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023153.4589389523, "perf/iters_per_sec": 0.9647147459692728, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365758419036866, "data/tokens_consumed": 9271508992, "data/tokens_consumed_B": 9.271508992, "train/loss_slope": -1.5660621710021002e-05} {"step": 4425, "timestamp": 1778330496.0662224, "eos/sharpness": 44.316506385803216, "eos/L0_probe": 2.3603973388671875, "eos/L_plus": 2.5399959087371826, "eos/L_minus": 2.6239638328552246, "eos/grad_norm": 0.12140383571386337, "eos/embed_grad_frac": 0.1599581241607666, "eos/time_s": 0.5953710079193115} {"step": 4425, "timestamp": 1778330497.442989, "geo/rankme_last": 427.7271423339844, "geo/layer_0/stable_rank_q_proj": 20.634334564208984, "geo/layer_0/stable_rank_k_proj": 17.153602600097656, "geo/layer_0/stable_rank_o_proj": 45.61608123779297, "geo/layer_0/stable_rank_gate_proj": 130.21649169921875, "geo/layer_0/stable_rank_down_proj": 56.436527252197266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.065156951546669, "geo/layer_0/attn_entropy_mean": 6.25483512878418, "geo/layer_0/attn_entropy_std": 0.4276338815689087, "geo/layer_7/stable_rank_q_proj": 42.577964782714844, "geo/layer_7/stable_rank_k_proj": 39.46845626831055, "geo/layer_7/stable_rank_o_proj": 90.09538269042969, "geo/layer_7/stable_rank_gate_proj": 79.27012634277344, "geo/layer_7/stable_rank_down_proj": 142.86080932617188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4215468764305115, "geo/layer_7/attn_entropy_mean": 4.759188652038574, "geo/layer_7/attn_entropy_std": 0.765252947807312, "geo/layer_14/stable_rank_q_proj": 51.71843338012695, "geo/layer_14/stable_rank_k_proj": 41.65935134887695, "geo/layer_14/stable_rank_o_proj": 42.646202087402344, "geo/layer_14/stable_rank_gate_proj": 71.8796157836914, "geo/layer_14/stable_rank_down_proj": 127.24085235595703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3835885226726532, "geo/layer_14/attn_entropy_mean": 5.535109996795654, "geo/layer_14/attn_entropy_std": 0.44690200686454773, "geo/layer_21/stable_rank_q_proj": 39.553375244140625, "geo/layer_21/stable_rank_k_proj": 29.093908309936523, "geo/layer_21/stable_rank_o_proj": 66.46820068359375, "geo/layer_21/stable_rank_gate_proj": 62.644996643066406, "geo/layer_21/stable_rank_down_proj": 49.77288055419922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13375027477741241, "geo/layer_21/attn_entropy_mean": 5.871178150177002, "geo/layer_21/attn_entropy_std": 0.32273486256599426, "geo/layer_27/stable_rank_q_proj": 43.752159118652344, "geo/layer_27/stable_rank_k_proj": 30.684236526489258, "geo/layer_27/stable_rank_o_proj": 110.25641632080078, "geo/layer_27/stable_rank_gate_proj": 73.6595458984375, "geo/layer_27/stable_rank_down_proj": 127.50846099853516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11073757708072662, "geo/layer_27/attn_entropy_mean": 4.305603981018066, "geo/layer_27/attn_entropy_std": 0.6416972279548645, "attnres/final_alpha/block_0": 0.24995389580726624, "attnres/block_norm/0": 1.7780306339263916, "attnres/final_alpha/block_1": 0.004008378833532333, "attnres/block_norm/1": 49741.171875, "attnres/final_alpha/block_2": 0.008602923713624477, "attnres/block_norm/2": 29650.8671875, "attnres/final_alpha/block_3": 0.010252060368657112, "attnres/block_norm/3": 68801.5390625, "attnres/final_alpha/block_4": 0.012154050171375275, "attnres/block_norm/4": 16784.287109375, "attnres/final_alpha/block_5": 0.6111801862716675, "attnres/block_norm/5": 7011.48583984375, "attnres/final_alpha/block_6": 0.10384854674339294, "attnres/block_norm/6": 45720.0078125, "geo/tier1_time_s": 1.3572900295257568, "geo/step": 4425.0, "geo/rankme_slope": 0.0016228504487732593} {"step": 4430, "timestamp": 1778330502.634711, "train/loss": 2.3415499925613403, "train/z_loss": 0.0013480112073011696, "train/perplexity": 10.397339878967795, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700054.1692697585, "perf/iters_per_sec": 0.8106489988659661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2335795164108276, "data/tokens_consumed": 9292480512, "data/tokens_consumed_B": 9.292480512, "train/loss_slope": -1.7623934501146622e-05} {"step": 4440, "timestamp": 1778330513.0119665, "train/loss": 2.3309681177139283, "train/z_loss": 0.001349322556052357, "train/perplexity": 10.287896608369156, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021937.4263895904, "perf/iters_per_sec": 0.9641348964641525, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371992588043213, "data/tokens_consumed": 9313452032, "data/tokens_consumed_B": 9.313452032, "train/loss_slope": -2.0237385386144475e-05} {"step": 4450, "timestamp": 1778330523.3779798, "grad/layer_0/attn": 0.00343682081438601, "grad/layer_0/mlp": 0.0035712721291929483, "grad/layer_0/attn_mlp_ratio": 0.9623519557798252, "grad/layer_4/attn": 0.0031785222236067057, "grad/layer_4/mlp": 0.002586664166301489, "grad/layer_4/attn_mlp_ratio": 1.2288112782999387, "grad/layer_8/attn": 0.006435415241867304, "grad/layer_8/mlp": 0.0035127669107168913, "grad/layer_8/attn_mlp_ratio": 1.8320074238439041, "grad/layer_12/attn": 0.0048189666122198105, "grad/layer_12/mlp": 0.007086430210620165, "grad/layer_12/attn_mlp_ratio": 0.6800273764066808, "grad/layer_16/attn": 0.007282927166670561, "grad/layer_16/mlp": 0.005309275351464748, "grad/layer_16/attn_mlp_ratio": 1.3717365454567463, "grad/layer_20/attn": 0.008159427903592587, "grad/layer_20/mlp": 0.006751236040145159, "grad/layer_20/attn_mlp_ratio": 1.2085828038325956, "grad/layer_24/attn": 0.013010570779442787, "grad/layer_24/mlp": 0.01181154977530241, "grad/layer_24/attn_mlp_ratio": 1.1015125802116361, "grad/layer_27/attn": 0.0045281159691512585, "grad/layer_27/mlp": 0.01227552630007267, "grad/layer_27/attn_mlp_ratio": 0.3688734659170667} {"step": 4450, "timestamp": 1778330523.3935168, "train/loss": 2.377650785446167, "train/z_loss": 0.0013369624270126224, "train/perplexity": 10.7795496206346, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021466.5764649222, "perf/iters_per_sec": 0.9639103777241336, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374408483505249, "data/tokens_consumed": 9334423552, "data/tokens_consumed_B": 9.334423552, "train/loss_slope": -1.7499117589447527e-05} {"step": 4460, "timestamp": 1778330533.7691445, "train/loss": 2.3525022745132445, "train/z_loss": 0.001347435766365379, "train/perplexity": 10.511840353060945, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022865.4561724938, "perf/iters_per_sec": 0.96457741554856, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367234230041504, "data/tokens_consumed": 9355395072, "data/tokens_consumed_B": 9.355395072, "train/loss_slope": -1.870732473390009e-05} {"step": 4470, "timestamp": 1778330544.145161, "train/loss": 2.356679630279541, "train/z_loss": 0.0013458098750561477, "train/perplexity": 10.555843895209891, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022091.5585393223, "perf/iters_per_sec": 0.9642083924004184, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371201992034913, "data/tokens_consumed": 9376366592, "data/tokens_consumed_B": 9.376366592, "train/loss_slope": -1.9452938245217633e-05} {"step": 4480, "timestamp": 1778330554.5206828, "train/loss": 2.3440714359283445, "train/z_loss": 0.001350402634125203, "train/perplexity": 10.423589261899258, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022492.1982103249, "perf/iters_per_sec": 0.9643994322826027, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369147539138794, "data/tokens_consumed": 9397338112, "data/tokens_consumed_B": 9.397338112, "train/loss_slope": -2.0437595402911874e-05} {"step": 4490, "timestamp": 1778330564.8986197, "train/loss": 2.4048436641693116, "train/z_loss": 0.0013425665441900493, "train/perplexity": 11.07669846196184, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022426.7701652625, "perf/iters_per_sec": 0.9643682337595284, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036948299407959, "data/tokens_consumed": 9418309632, "data/tokens_consumed_B": 9.418309632, "train/loss_slope": -1.8599455241430215e-05} {"step": 4500, "timestamp": 1778330575.2688847, "grad/layer_0/attn": 0.004372239112854004, "grad/layer_0/mlp": 0.0037083264905959368, "grad/layer_0/attn_mlp_ratio": 1.1790329158013493, "grad/layer_4/attn": 0.0016977866180241108, "grad/layer_4/mlp": 0.002838031155988574, "grad/layer_4/attn_mlp_ratio": 0.5982268921251607, "grad/layer_8/attn": 0.005388778168708086, "grad/layer_8/mlp": 0.003634514519944787, "grad/layer_8/attn_mlp_ratio": 1.4826679026510419, "grad/layer_12/attn": 0.009133760817348957, "grad/layer_12/mlp": 0.00718143954873085, "grad/layer_12/attn_mlp_ratio": 1.2718565168145823, "grad/layer_16/attn": 0.0070582302287220955, "grad/layer_16/mlp": 0.004506904631853104, "grad/layer_16/attn_mlp_ratio": 1.5660926175867853, "grad/layer_20/attn": 0.0025944283697754145, "grad/layer_20/mlp": 0.005571382120251656, "grad/layer_20/attn_mlp_ratio": 0.4656705045912693, "grad/layer_24/attn": 0.005482981447130442, "grad/layer_24/mlp": 0.007475441321730614, "grad/layer_24/attn_mlp_ratio": 0.7334659102794623, "grad/layer_27/attn": 0.005068894010037184, "grad/layer_27/mlp": 0.006661965511739254, "grad/layer_27/attn_mlp_ratio": 0.7608706357032421} {"step": 4500, "timestamp": 1778330575.8685179, "eos/sharpness": 9.357738494873045, "eos/L0_probe": 2.36262583732605, "eos/L_plus": 2.424973487854004, "eos/L_minus": 2.393855571746826, "eos/grad_norm": 0.09979725629091263, "eos/embed_grad_frac": 0.23798035085201263, "eos/time_s": 0.5967607498168945} {"step": 4500, "timestamp": 1778330575.8896525, "train/loss": 2.3253363132476808, "train/z_loss": 0.0013461487833410501, "train/perplexity": 10.230120032200995, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908794.1993338605, "perf/iters_per_sec": 0.9101840016049674, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0986789464950562, "data/tokens_consumed": 9439281152, "data/tokens_consumed_B": 9.439281152, "train/loss_slope": -2.3297798429707167e-05} {"step": 4500, "timestamp": 1778330577.2527437, "geo/rankme_last": 426.2857971191406, "geo/layer_0/stable_rank_q_proj": 20.624744415283203, "geo/layer_0/stable_rank_k_proj": 17.152070999145508, "geo/layer_0/stable_rank_o_proj": 45.63906478881836, "geo/layer_0/stable_rank_gate_proj": 129.87034606933594, "geo/layer_0/stable_rank_down_proj": 56.449283599853516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06452780216932297, "geo/layer_0/attn_entropy_mean": 6.25154972076416, "geo/layer_0/attn_entropy_std": 0.4248892068862915, "geo/layer_7/stable_rank_q_proj": 42.6386833190918, "geo/layer_7/stable_rank_k_proj": 39.481624603271484, "geo/layer_7/stable_rank_o_proj": 90.045654296875, "geo/layer_7/stable_rank_gate_proj": 79.40692901611328, "geo/layer_7/stable_rank_down_proj": 142.82545471191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4111520051956177, "geo/layer_7/attn_entropy_mean": 4.733620643615723, "geo/layer_7/attn_entropy_std": 0.7728375196456909, "geo/layer_14/stable_rank_q_proj": 51.74650192260742, "geo/layer_14/stable_rank_k_proj": 41.512611389160156, "geo/layer_14/stable_rank_o_proj": 42.7205924987793, "geo/layer_14/stable_rank_gate_proj": 71.81301879882812, "geo/layer_14/stable_rank_down_proj": 127.2702865600586, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802506923675537, "geo/layer_14/attn_entropy_mean": 5.524364948272705, "geo/layer_14/attn_entropy_std": 0.44553807377815247, "geo/layer_21/stable_rank_q_proj": 39.603092193603516, "geo/layer_21/stable_rank_k_proj": 29.182283401489258, "geo/layer_21/stable_rank_o_proj": 66.37372589111328, "geo/layer_21/stable_rank_gate_proj": 62.58566665649414, "geo/layer_21/stable_rank_down_proj": 49.76927185058594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.133857861161232, "geo/layer_21/attn_entropy_mean": 5.870268821716309, "geo/layer_21/attn_entropy_std": 0.32726022601127625, "geo/layer_27/stable_rank_q_proj": 43.661155700683594, "geo/layer_27/stable_rank_k_proj": 30.607662200927734, "geo/layer_27/stable_rank_o_proj": 110.07048034667969, "geo/layer_27/stable_rank_gate_proj": 73.5862045288086, "geo/layer_27/stable_rank_down_proj": 127.3933334350586, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10015781968832016, "geo/layer_27/attn_entropy_mean": 4.3150835037231445, "geo/layer_27/attn_entropy_std": 0.6544133424758911, "attnres/final_alpha/block_0": 0.2512524724006653, "attnres/block_norm/0": 1.7779544591903687, "attnres/final_alpha/block_1": 0.004034892190247774, "attnres/block_norm/1": 49646.73828125, "attnres/final_alpha/block_2": 0.008667746558785439, "attnres/block_norm/2": 29812.212890625, "attnres/final_alpha/block_3": 0.01042400673031807, "attnres/block_norm/3": 68768.421875, "attnres/final_alpha/block_4": 0.012128927744925022, "attnres/block_norm/4": 16732.822265625, "attnres/final_alpha/block_5": 0.6103179454803467, "attnres/block_norm/5": 7034.5615234375, "attnres/final_alpha/block_6": 0.10317403078079224, "attnres/block_norm/6": 46307.15625, "geo/tier1_time_s": 1.3594212532043457, "geo/step": 4500.0, "geo/rankme_slope": 0.0015380485788065227} {"step": 4500, "timestamp": 1778330584.1871827, "geo/ww_alpha_mean": 7.811358067464131, "geo/ww_alpha_std": 4.810209243434604, "geo/ww_alpha_min": 1.3413912070962524, "geo/ww_alpha_max": 31.349238186845156, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9054076328478495, "geo/ww_alpha_by_type/k_proj": 4.5291199139529645, "geo/ww_alpha_by_type/v_proj": 8.724470986989674, "geo/ww_alpha_by_type/o_proj": 9.299753741996472, "geo/ww_alpha_by_type/gate_proj": 7.985269113527617, "geo/ww_alpha_by_type/up_proj": 12.113002787397578, "geo/ww_alpha_by_type/down_proj": 8.22304448916716, "geo/twonn_id/layer_0": 0.7337880730628967, "geo/twonn_id/layer_7": 3.19874906539917, "geo/twonn_id/layer_14": 5.433222770690918, "geo/twonn_id/layer_21": 7.69130277633667, "geo/twonn_id/layer_27": 6.609893321990967, "geo/tier2_time_s": 6.928163766860962} {"step": 4500, "timestamp": 1778330584.9439719, "eoc/jacobian_sigma/layer_0/attn": 1632.33837890625, "eoc/jacobian_sigma/layer_0/mlp": 11798.705078125, "eoc/jacobian_sigma/layer_0": 11798.705078125, "eoc/jacobian_sigma/layer_7/attn": 1.110836148262024, "eoc/jacobian_sigma/layer_7/mlp": 1.8193808794021606, "eoc/jacobian_sigma/layer_7": 1.8193808794021606, "eoc/jacobian_sigma/layer_14/attn": 1.794953465461731, "eoc/jacobian_sigma/layer_14/mlp": 13.046612739562988, "eoc/jacobian_sigma/layer_14": 13.046612739562988, "eoc/jacobian_sigma/layer_21/attn": 1.0717304944992065, "eoc/jacobian_sigma/layer_21/mlp": 4.9205732345581055, "eoc/jacobian_sigma/layer_21": 4.9205732345581055, "eoc/jacobian_sigma/layer_27/attn": 3.4135656356811523, "eoc/jacobian_sigma/layer_27/mlp": 31.333282470703125, "eoc/jacobian_sigma/layer_27": 31.333282470703125, "eoc/layer0_sigma": 11798.705078125, "eoc/sigma_max": 31.333282470703125, "eoc/sigma_min": 1.8193808794021606, "eoc/sigma_mean": 12.779962331056595, "eoc/time_s": 0.7479126453399658} {"step": 4510, "timestamp": 1778330595.3403683, "train/loss": 2.3360546112060545, "train/z_loss": 0.001337733317632228, "train/perplexity": 10.340359239820819, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1078386.7855184434, "perf/iters_per_sec": 0.5142148902504174, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.944712257385254, "data/tokens_consumed": 9460252672, "data/tokens_consumed_B": 9.460252672, "train/loss_slope": -2.6017317598802423e-05} {"step": 4520, "timestamp": 1778330605.7217429, "train/loss": 2.3710160732269285, "train/z_loss": 0.001359591935761273, "train/perplexity": 10.708267141899208, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021117.100739559, "perf/iters_per_sec": 0.9637437347123904, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037620234489441, "data/tokens_consumed": 9481224192, "data/tokens_consumed_B": 9.481224192, "train/loss_slope": -2.668065209307186e-05} {"step": 4530, "timestamp": 1778330616.1042347, "train/loss": 2.3967049360275268, "train/z_loss": 0.0013540248852223157, "train/perplexity": 10.986914085388511, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021156.0182921416, "perf/iters_per_sec": 0.9637622920475681, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376002550125123, "data/tokens_consumed": 9502195712, "data/tokens_consumed_B": 9.502195712, "train/loss_slope": -2.5955927253949775e-05} {"step": 4540, "timestamp": 1778330626.4791386, "train/loss": 2.362635374069214, "train/z_loss": 0.0013471278245560825, "train/perplexity": 10.618899381708557, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022633.8572238411, "perf/iters_per_sec": 0.9644669805640417, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036842131614685, "data/tokens_consumed": 9523167232, "data/tokens_consumed_B": 9.523167232, "train/loss_slope": -2.5221351986349167e-05} {"step": 4550, "timestamp": 1778330636.8478632, "grad/layer_0/attn": 0.002604798413813114, "grad/layer_0/mlp": 0.003084411844611168, "grad/layer_0/attn_mlp_ratio": 0.8445040612567996, "grad/layer_4/attn": 0.0020786321256309748, "grad/layer_4/mlp": 0.002685733139514923, "grad/layer_4/attn_mlp_ratio": 0.7739533081872277, "grad/layer_8/attn": 0.005521632265299559, "grad/layer_8/mlp": 0.003818113822489977, "grad/layer_8/attn_mlp_ratio": 1.4461674998158869, "grad/layer_12/attn": 0.005033220164477825, "grad/layer_12/mlp": 0.006497598718851805, "grad/layer_12/attn_mlp_ratio": 0.77462772091664, "grad/layer_16/attn": 0.0035206859465688467, "grad/layer_16/mlp": 0.004571235738694668, "grad/layer_16/attn_mlp_ratio": 0.7701825219269788, "grad/layer_20/attn": 0.003179313847795129, "grad/layer_20/mlp": 0.006059298291802406, "grad/layer_20/attn_mlp_ratio": 0.5246999969660524, "grad/layer_24/attn": 0.00810231827199459, "grad/layer_24/mlp": 0.009887032210826874, "grad/layer_24/attn_mlp_ratio": 0.8194894096909224, "grad/layer_27/attn": 0.00519855972379446, "grad/layer_27/mlp": 0.010306733660399914, "grad/layer_27/attn_mlp_ratio": 0.5043847881050485} {"step": 4550, "timestamp": 1778330636.863574, "train/loss": 2.3873451709747315, "train/z_loss": 0.0013485749834217132, "train/perplexity": 10.884558908334679, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020438.4673101176, "perf/iters_per_sec": 0.9634201370764339, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379687547683716, "data/tokens_consumed": 9544138752, "data/tokens_consumed_B": 9.544138752, "train/loss_slope": -2.2288497660992003e-05} {"step": 4560, "timestamp": 1778330647.244964, "train/loss": 2.3080965757369993, "train/z_loss": 0.0013565973495133221, "train/perplexity": 10.05526698905662, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021466.3906402958, "perf/iters_per_sec": 0.9639102891160468, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374409437179566, "data/tokens_consumed": 9565110272, "data/tokens_consumed_B": 9.565110272, "train/loss_slope": -2.3560717153792438e-05} {"step": 4570, "timestamp": 1778330657.6218362, "train/loss": 2.4095436096191407, "train/z_loss": 0.0013362481258809566, "train/perplexity": 11.12888087167997, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022197.410177371, "perf/iters_per_sec": 0.9642588663946967, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370659112930298, "data/tokens_consumed": 9586081792, "data/tokens_consumed_B": 9.586081792, "train/loss_slope": -1.9633350163438925e-05} {"step": 4575, "timestamp": 1778330663.4135282, "eos/sharpness": 66.50156974792479, "eos/L0_probe": 2.3567216396331787, "eos/L_plus": 2.6661360263824463, "eos/L_minus": 2.712322950363159, "eos/grad_norm": 0.2690642476081848, "eos/embed_grad_frac": 0.0356857031583786, "eos/time_s": 0.6122581958770752} {"step": 4575, "timestamp": 1778330664.79117, "geo/rankme_last": 426.9310607910156, "geo/layer_0/stable_rank_q_proj": 20.616710662841797, "geo/layer_0/stable_rank_k_proj": 17.17346954345703, "geo/layer_0/stable_rank_o_proj": 45.595619201660156, "geo/layer_0/stable_rank_gate_proj": 129.9459228515625, "geo/layer_0/stable_rank_down_proj": 56.49507141113281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06804527342319489, "geo/layer_0/attn_entropy_mean": 6.255990982055664, "geo/layer_0/attn_entropy_std": 0.4222509264945984, "geo/layer_7/stable_rank_q_proj": 42.5840950012207, "geo/layer_7/stable_rank_k_proj": 39.35675811767578, "geo/layer_7/stable_rank_o_proj": 90.03372192382812, "geo/layer_7/stable_rank_gate_proj": 79.4790267944336, "geo/layer_7/stable_rank_down_proj": 142.49755859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4057044982910156, "geo/layer_7/attn_entropy_mean": 4.736248016357422, "geo/layer_7/attn_entropy_std": 0.7665413618087769, "geo/layer_14/stable_rank_q_proj": 51.78738021850586, "geo/layer_14/stable_rank_k_proj": 41.54338073730469, "geo/layer_14/stable_rank_o_proj": 42.74489974975586, "geo/layer_14/stable_rank_gate_proj": 71.83427429199219, "geo/layer_14/stable_rank_down_proj": 127.24484252929688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36900195479393005, "geo/layer_14/attn_entropy_mean": 5.506674289703369, "geo/layer_14/attn_entropy_std": 0.4447806775569916, "geo/layer_21/stable_rank_q_proj": 39.638240814208984, "geo/layer_21/stable_rank_k_proj": 29.269872665405273, "geo/layer_21/stable_rank_o_proj": 66.35172271728516, "geo/layer_21/stable_rank_gate_proj": 62.5845832824707, "geo/layer_21/stable_rank_down_proj": 49.76611328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13497698307037354, "geo/layer_21/attn_entropy_mean": 5.8622050285339355, "geo/layer_21/attn_entropy_std": 0.3076198995113373, "geo/layer_27/stable_rank_q_proj": 43.647247314453125, "geo/layer_27/stable_rank_k_proj": 30.63779640197754, "geo/layer_27/stable_rank_o_proj": 109.7889633178711, "geo/layer_27/stable_rank_gate_proj": 73.5911636352539, "geo/layer_27/stable_rank_down_proj": 127.28196716308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1021505743265152, "geo/layer_27/attn_entropy_mean": 4.297463417053223, "geo/layer_27/attn_entropy_std": 0.655512273311615, "attnres/final_alpha/block_0": 0.2513793110847473, "attnres/block_norm/0": 1.778210997581482, "attnres/final_alpha/block_1": 0.003985195886343718, "attnres/block_norm/1": 49547.5390625, "attnres/final_alpha/block_2": 0.008650125004351139, "attnres/block_norm/2": 29681.83203125, "attnres/final_alpha/block_3": 0.010573035106062889, "attnres/block_norm/3": 68843.90625, "attnres/final_alpha/block_4": 0.012158993631601334, "attnres/block_norm/4": 16794.57421875, "attnres/final_alpha/block_5": 0.6100116968154907, "attnres/block_norm/5": 6986.0185546875, "attnres/final_alpha/block_6": 0.10324171185493469, "attnres/block_norm/6": 45961.6953125, "geo/tier1_time_s": 1.3565151691436768, "geo/step": 4575.0, "geo/rankme_slope": 0.0014775705594737894} {"step": 4580, "timestamp": 1778330669.9800131, "train/loss": 2.3951511859893797, "train/z_loss": 0.0013406801968812942, "train/perplexity": 10.969856422313178, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697689.318383017, "perf/iters_per_sec": 0.8095213500895582, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2352978706359863, "data/tokens_consumed": 9607053312, "data/tokens_consumed_B": 9.607053312, "train/loss_slope": -2.1263239657667914e-05} {"step": 4590, "timestamp": 1778330680.370085, "train/loss": 2.3627617359161377, "train/z_loss": 0.0013514323974959553, "train/perplexity": 10.620241290227963, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019249.752935606, "perf/iters_per_sec": 0.9628533138921767, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038579797744751, "data/tokens_consumed": 9628024832, "data/tokens_consumed_B": 9.628024832, "train/loss_slope": -2.1881832219515035e-05} {"step": 4600, "timestamp": 1778330690.7342863, "grad/layer_0/attn": 0.003171028569340706, "grad/layer_0/mlp": 0.0031381279695779085, "grad/layer_0/attn_mlp_ratio": 1.0104841163372986, "grad/layer_4/attn": 0.0027868319302797318, "grad/layer_4/mlp": 0.002500188536942005, "grad/layer_4/attn_mlp_ratio": 1.1146486665455457, "grad/layer_8/attn": 0.002958382945507765, "grad/layer_8/mlp": 0.0034254654310643673, "grad/layer_8/attn_mlp_ratio": 0.8636440561667337, "grad/layer_12/attn": 0.005218531470745802, "grad/layer_12/mlp": 0.006748076993972063, "grad/layer_12/attn_mlp_ratio": 0.7733360775334686, "grad/layer_16/attn": 0.004271834157407284, "grad/layer_16/mlp": 0.004514950327575207, "grad/layer_16/attn_mlp_ratio": 0.9461530588058992, "grad/layer_20/attn": 0.003047884674742818, "grad/layer_20/mlp": 0.005777465645223856, "grad/layer_20/attn_mlp_ratio": 0.5275469919077337, "grad/layer_24/attn": 0.012192091904580593, "grad/layer_24/mlp": 0.010691134259104729, "grad/layer_24/attn_mlp_ratio": 1.1403927305616197, "grad/layer_27/attn": 0.006764167919754982, "grad/layer_27/mlp": 0.011082264594733715, "grad/layer_27/attn_mlp_ratio": 0.610359714920842} {"step": 4600, "timestamp": 1778330690.7504094, "train/loss": 2.379685616493225, "train/z_loss": 0.0013472306891344488, "train/perplexity": 10.801506514584617, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021549.736413708, "perf/iters_per_sec": 0.9639500314777889, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373981714248657, "data/tokens_consumed": 9648996352, "data/tokens_consumed_B": 9.648996352, "train/loss_slope": -1.9333279587075885e-05} {"step": 4610, "timestamp": 1778330701.1258883, "train/loss": 2.371464467048645, "train/z_loss": 0.0013516460312530398, "train/perplexity": 10.713069739373978, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022417.749146425, "perf/iters_per_sec": 0.9643639322025418, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369529247283935, "data/tokens_consumed": 9669967872, "data/tokens_consumed_B": 9.669967872, "train/loss_slope": -2.2090847271181942e-05} {"step": 4620, "timestamp": 1778330711.5151286, "train/loss": 2.3833800554275513, "train/z_loss": 0.0013516916427761317, "train/perplexity": 10.841485825890883, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019833.3370857686, "perf/iters_per_sec": 0.9631315884999125, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0382797241210937, "data/tokens_consumed": 9690939392, "data/tokens_consumed_B": 9.690939392, "train/loss_slope": -1.9914835607401113e-05} {"step": 4630, "timestamp": 1778330721.8985617, "train/loss": 2.414974737167358, "train/z_loss": 0.0013507828116416931, "train/perplexity": 11.189487675827548, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020909.9526897026, "perf/iters_per_sec": 0.9636449588249696, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037726593017578, "data/tokens_consumed": 9711910912, "data/tokens_consumed_B": 9.711910912, "train/loss_slope": -1.3196488496887449e-05} {"step": 4640, "timestamp": 1778330732.2782466, "train/loss": 2.3804914712905885, "train/z_loss": 0.001352431031409651, "train/perplexity": 10.810214468630171, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021645.1232597511, "perf/iters_per_sec": 0.9639955154703861, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373492240905762, "data/tokens_consumed": 9732882432, "data/tokens_consumed_B": 9.732882432, "train/loss_slope": -9.857697877446172e-06} {"step": 4650, "timestamp": 1778330742.650138, "grad/layer_0/attn": 0.0028432493563741446, "grad/layer_0/mlp": 0.003129850374534726, "grad/layer_0/attn_mlp_ratio": 0.9084297730858255, "grad/layer_4/attn": 0.001808023895137012, "grad/layer_4/mlp": 0.0026436957996338606, "grad/layer_4/attn_mlp_ratio": 0.6839001018942514, "grad/layer_8/attn": 0.003613630775362253, "grad/layer_8/mlp": 0.003481693798676133, "grad/layer_8/attn_mlp_ratio": 1.0378944503812604, "grad/layer_12/attn": 0.005786519031971693, "grad/layer_12/mlp": 0.006627498660236597, "grad/layer_12/attn_mlp_ratio": 0.873107523865477, "grad/layer_16/attn": 0.0033791696187108755, "grad/layer_16/mlp": 0.0045221406035125256, "grad/layer_16/attn_mlp_ratio": 0.7472499951375111, "grad/layer_20/attn": 0.002852126257494092, "grad/layer_20/mlp": 0.00598894851282239, "grad/layer_20/attn_mlp_ratio": 0.4762315461160688, "grad/layer_24/attn": 0.014220085926353931, "grad/layer_24/mlp": 0.011736495420336723, "grad/layer_24/attn_mlp_ratio": 1.2116126063111174, "grad/layer_27/attn": 0.010631517507135868, "grad/layer_27/mlp": 0.012073580175638199, "grad/layer_27/attn_mlp_ratio": 0.8805604687606963} {"step": 4650, "timestamp": 1778330743.2521791, "eos/sharpness": 67.55471229553221, "eos/L0_probe": 2.3512210845947266, "eos/L_plus": 2.642160415649414, "eos/L_minus": 2.7358288764953613, "eos/grad_norm": 0.20466038584709167, "eos/embed_grad_frac": 0.055972062051296234, "eos/time_s": 0.599165678024292} {"step": 4650, "timestamp": 1778330743.2711809, "train/loss": 2.382859945297241, "train/z_loss": 0.0013400081894360482, "train/perplexity": 10.835848525420847, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908892.4567726478, "perf/iters_per_sec": 0.910230854402851, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0986223936080932, "data/tokens_consumed": 9753853952, "data/tokens_consumed_B": 9.753853952, "train/loss_slope": -8.404332926922194e-06} {"step": 4650, "timestamp": 1778330744.6348581, "geo/rankme_last": 426.517578125, "geo/layer_0/stable_rank_q_proj": 20.67180061340332, "geo/layer_0/stable_rank_k_proj": 17.20919418334961, "geo/layer_0/stable_rank_o_proj": 45.610843658447266, "geo/layer_0/stable_rank_gate_proj": 129.86126708984375, "geo/layer_0/stable_rank_down_proj": 56.5019645690918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0655154213309288, "geo/layer_0/attn_entropy_mean": 6.254649639129639, "geo/layer_0/attn_entropy_std": 0.42280152440071106, "geo/layer_7/stable_rank_q_proj": 42.495704650878906, "geo/layer_7/stable_rank_k_proj": 39.434600830078125, "geo/layer_7/stable_rank_o_proj": 90.00453186035156, "geo/layer_7/stable_rank_gate_proj": 79.4066390991211, "geo/layer_7/stable_rank_down_proj": 142.62046813964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41149017214775085, "geo/layer_7/attn_entropy_mean": 4.749872207641602, "geo/layer_7/attn_entropy_std": 0.7618151903152466, "geo/layer_14/stable_rank_q_proj": 51.78896713256836, "geo/layer_14/stable_rank_k_proj": 41.53763961791992, "geo/layer_14/stable_rank_o_proj": 42.7122917175293, "geo/layer_14/stable_rank_gate_proj": 71.92530059814453, "geo/layer_14/stable_rank_down_proj": 127.61473846435547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3560219705104828, "geo/layer_14/attn_entropy_mean": 5.5210676193237305, "geo/layer_14/attn_entropy_std": 0.4399310350418091, "geo/layer_21/stable_rank_q_proj": 39.60419845581055, "geo/layer_21/stable_rank_k_proj": 29.224498748779297, "geo/layer_21/stable_rank_o_proj": 66.3292465209961, "geo/layer_21/stable_rank_gate_proj": 62.54734420776367, "geo/layer_21/stable_rank_down_proj": 49.73764419555664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13334228098392487, "geo/layer_21/attn_entropy_mean": 5.865933418273926, "geo/layer_21/attn_entropy_std": 0.3197225034236908, "geo/layer_27/stable_rank_q_proj": 43.55727767944336, "geo/layer_27/stable_rank_k_proj": 30.748456954956055, "geo/layer_27/stable_rank_o_proj": 109.8766860961914, "geo/layer_27/stable_rank_gate_proj": 73.49678802490234, "geo/layer_27/stable_rank_down_proj": 127.25175476074219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10759781301021576, "geo/layer_27/attn_entropy_mean": 4.314035415649414, "geo/layer_27/attn_entropy_std": 0.6636248826980591, "attnres/final_alpha/block_0": 0.25367075204849243, "attnres/block_norm/0": 1.7779500484466553, "attnres/final_alpha/block_1": 0.004088064655661583, "attnres/block_norm/1": 49467.07421875, "attnres/final_alpha/block_2": 0.008767873980104923, "attnres/block_norm/2": 29796.28125, "attnres/final_alpha/block_3": 0.010696000419557095, "attnres/block_norm/3": 69043.1796875, "attnres/final_alpha/block_4": 0.01238284818828106, "attnres/block_norm/4": 16742.8046875, "attnres/final_alpha/block_5": 0.607054591178894, "attnres/block_norm/5": 7040.9599609375, "attnres/final_alpha/block_6": 0.10333988070487976, "attnres/block_norm/6": 46103.109375, "geo/tier1_time_s": 1.3599693775177002, "geo/step": 4650.0, "geo/rankme_slope": 0.0014574058138880552} {"step": 4660, "timestamp": 1778330755.0129704, "train/loss": 2.377127432823181, "train/z_loss": 0.0013431594357825815, "train/perplexity": 10.773909591056906, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786621.882337772, "perf/iters_per_sec": 0.8519277011574612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1738085269927978, "data/tokens_consumed": 9774825472, "data/tokens_consumed_B": 9.774825472, "train/loss_slope": -8.219256845995612e-06} {"step": 4670, "timestamp": 1778330765.3954031, "train/loss": 2.4127106428146363, "train/z_loss": 0.0013294260483235122, "train/perplexity": 11.164182277685251, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021136.3735645511, "perf/iters_per_sec": 0.9637529247114902, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376103401184082, "data/tokens_consumed": 9795796992, "data/tokens_consumed_B": 9.795796992, "train/loss_slope": -5.108465176962057e-06} {"step": 4680, "timestamp": 1778330775.777937, "train/loss": 2.393181657791138, "train/z_loss": 0.0013552888878621161, "train/perplexity": 10.948272243060764, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021243.565042544, "perf/iters_per_sec": 0.9638040375912399, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375553131103517, "data/tokens_consumed": 9816768512, "data/tokens_consumed_B": 9.816768512, "train/loss_slope": -3.348777017327295e-06} {"step": 4690, "timestamp": 1778330786.1552656, "train/loss": 2.3595487594604494, "train/z_loss": 0.001351795857772231, "train/perplexity": 10.58617346387817, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021861.019765712, "perf/iters_per_sec": 0.9640984629467545, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372384548187257, "data/tokens_consumed": 9837740032, "data/tokens_consumed_B": 9.837740032, "train/loss_slope": -2.352543618276314e-06} {"step": 4700, "timestamp": 1778330796.5293674, "grad/layer_0/attn": 0.00292610889300704, "grad/layer_0/mlp": 0.003263782011345029, "grad/layer_0/attn_mlp_ratio": 0.8965392888317439, "grad/layer_4/attn": 0.002111897338181734, "grad/layer_4/mlp": 0.0026545098517090082, "grad/layer_4/attn_mlp_ratio": 0.7955884048662417, "grad/layer_8/attn": 0.005175344180315733, "grad/layer_8/mlp": 0.0034589115530252457, "grad/layer_8/attn_mlp_ratio": 1.4962348563569858, "grad/layer_12/attn": 0.00852159969508648, "grad/layer_12/mlp": 0.0070656947791576385, "grad/layer_12/attn_mlp_ratio": 1.2060526021613902, "grad/layer_16/attn": 0.005310583394020796, "grad/layer_16/mlp": 0.004219629801809788, "grad/layer_16/attn_mlp_ratio": 1.2585424593145247, "grad/layer_20/attn": 0.0030260367784649134, "grad/layer_20/mlp": 0.005660774186253548, "grad/layer_20/attn_mlp_ratio": 0.5345623452631294, "grad/layer_24/attn": 0.007315251976251602, "grad/layer_24/mlp": 0.0088771628215909, "grad/layer_24/attn_mlp_ratio": 0.8240529142998556, "grad/layer_27/attn": 0.012416284531354904, "grad/layer_27/mlp": 0.008212645538151264, "grad/layer_27/attn_mlp_ratio": 1.511849540137947} {"step": 4700, "timestamp": 1778330796.5449276, "train/loss": 2.3251389503479003, "train/z_loss": 0.0013575992663390935, "train/perplexity": 10.228101185275632, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019462.8198942442, "perf/iters_per_sec": 0.9629549121352406, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0384702205657959, "data/tokens_consumed": 9858711552, "data/tokens_consumed_B": 9.858711552, "train/loss_slope": -4.087500467766814e-06} {"step": 4710, "timestamp": 1778330806.9287143, "train/loss": 2.328816628456116, "train/z_loss": 0.0013471089070662857, "train/perplexity": 10.265786103116984, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020994.3205761674, "perf/iters_per_sec": 0.9636851885681951, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376832723617553, "data/tokens_consumed": 9879683072, "data/tokens_consumed_B": 9.879683072, "train/loss_slope": -6.8263082936330255e-06} {"step": 4720, "timestamp": 1778330817.310797, "train/loss": 2.3590232372283935, "train/z_loss": 0.0013473390601575375, "train/perplexity": 10.580611655925368, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020932.9825199877, "perf/iters_per_sec": 0.9636559403037966, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377147674560547, "data/tokens_consumed": 9900654592, "data/tokens_consumed_B": 9.900654592, "train/loss_slope": -8.777368315482989e-06} {"step": 4725, "timestamp": 1778330823.0898988, "eos/sharpness": 8.236002922058104, "eos/L0_probe": 2.3557541370391846, "eos/L_plus": 2.3940212726593018, "eos/L_minus": 2.3998470306396484, "eos/grad_norm": 0.09546378999948502, "eos/embed_grad_frac": 0.28336864709854126, "eos/time_s": 0.5976841449737549} {"step": 4725, "timestamp": 1778330824.4665482, "geo/rankme_last": 426.9776611328125, "geo/layer_0/stable_rank_q_proj": 20.697612762451172, "geo/layer_0/stable_rank_k_proj": 17.254301071166992, "geo/layer_0/stable_rank_o_proj": 45.620582580566406, "geo/layer_0/stable_rank_gate_proj": 129.7406768798828, "geo/layer_0/stable_rank_down_proj": 56.519012451171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06553276628255844, "geo/layer_0/attn_entropy_mean": 6.254730224609375, "geo/layer_0/attn_entropy_std": 0.4176521599292755, "geo/layer_7/stable_rank_q_proj": 42.51858139038086, "geo/layer_7/stable_rank_k_proj": 39.48697280883789, "geo/layer_7/stable_rank_o_proj": 90.07666778564453, "geo/layer_7/stable_rank_gate_proj": 79.50416564941406, "geo/layer_7/stable_rank_down_proj": 142.83828735351562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4063226282596588, "geo/layer_7/attn_entropy_mean": 4.728448867797852, "geo/layer_7/attn_entropy_std": 0.7768519520759583, "geo/layer_14/stable_rank_q_proj": 51.81475830078125, "geo/layer_14/stable_rank_k_proj": 41.584327697753906, "geo/layer_14/stable_rank_o_proj": 42.740291595458984, "geo/layer_14/stable_rank_gate_proj": 71.94651794433594, "geo/layer_14/stable_rank_down_proj": 128.02284240722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37215104699134827, "geo/layer_14/attn_entropy_mean": 5.546879291534424, "geo/layer_14/attn_entropy_std": 0.43564581871032715, "geo/layer_21/stable_rank_q_proj": 39.59531784057617, "geo/layer_21/stable_rank_k_proj": 29.259117126464844, "geo/layer_21/stable_rank_o_proj": 66.27397918701172, "geo/layer_21/stable_rank_gate_proj": 62.44327163696289, "geo/layer_21/stable_rank_down_proj": 49.70877456665039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1377200037240982, "geo/layer_21/attn_entropy_mean": 5.854999542236328, "geo/layer_21/attn_entropy_std": 0.31010183691978455, "geo/layer_27/stable_rank_q_proj": 43.51340866088867, "geo/layer_27/stable_rank_k_proj": 30.785003662109375, "geo/layer_27/stable_rank_o_proj": 109.5562744140625, "geo/layer_27/stable_rank_gate_proj": 73.489013671875, "geo/layer_27/stable_rank_down_proj": 127.4782485961914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10214189440011978, "geo/layer_27/attn_entropy_mean": 4.327300071716309, "geo/layer_27/attn_entropy_std": 0.6568019390106201, "attnres/final_alpha/block_0": 0.25236645340919495, "attnres/block_norm/0": 1.7779353857040405, "attnres/final_alpha/block_1": 0.004042790271341801, "attnres/block_norm/1": 49642.3359375, "attnres/final_alpha/block_2": 0.00868698675185442, "attnres/block_norm/2": 29714.619140625, "attnres/final_alpha/block_3": 0.010599251836538315, "attnres/block_norm/3": 68422.90625, "attnres/final_alpha/block_4": 0.01203722320497036, "attnres/block_norm/4": 16760.3203125, "attnres/final_alpha/block_5": 0.6094452142715454, "attnres/block_norm/5": 7002.6240234375, "attnres/final_alpha/block_6": 0.10282206535339355, "attnres/block_norm/6": 45834.87109375, "geo/tier1_time_s": 1.3556156158447266, "geo/step": 4725.0, "geo/rankme_slope": 0.0014463982077205883} {"step": 4730, "timestamp": 1778330829.6567376, "train/loss": 2.3428569555282595, "train/z_loss": 0.0013375061098486185, "train/perplexity": 10.410937701133482, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699637.2430111805, "perf/iters_per_sec": 0.8104501929336455, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2338821172714234, "data/tokens_consumed": 9921626112, "data/tokens_consumed_B": 9.921626112, "train/loss_slope": -1.1232763262364441e-05} {"step": 4740, "timestamp": 1778330840.0330412, "train/loss": 2.3903566122055055, "train/z_loss": 0.0013440325274132192, "train/perplexity": 10.917386522216143, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022014.396564506, "perf/iters_per_sec": 0.9641715987036257, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371597766876222, "data/tokens_consumed": 9942597632, "data/tokens_consumed_B": 9.942597632, "train/loss_slope": -1.0201733268991851e-05} {"step": 4750, "timestamp": 1778330850.4013784, "grad/layer_0/attn": 0.0031930352561175823, "grad/layer_0/mlp": 0.0036034577060490847, "grad/layer_0/attn_mlp_ratio": 0.8861031342610616, "grad/layer_4/attn": 0.0018621283816173673, "grad/layer_4/mlp": 0.002740641823038459, "grad/layer_4/attn_mlp_ratio": 0.6794497179525325, "grad/layer_8/attn": 0.0029362484347075224, "grad/layer_8/mlp": 0.003615587716922164, "grad/layer_8/attn_mlp_ratio": 0.8121081780851497, "grad/layer_12/attn": 0.0061284760013222694, "grad/layer_12/mlp": 0.007094682194292545, "grad/layer_12/attn_mlp_ratio": 0.8638126059925814, "grad/layer_16/attn": 0.003985704388469458, "grad/layer_16/mlp": 0.004478637594729662, "grad/layer_16/attn_mlp_ratio": 0.8899367754528851, "grad/layer_20/attn": 0.00478400569409132, "grad/layer_20/mlp": 0.00610910402610898, "grad/layer_20/attn_mlp_ratio": 0.7830944759388733, "grad/layer_24/attn": 0.0054773506708443165, "grad/layer_24/mlp": 0.007895269431173801, "grad/layer_24/attn_mlp_ratio": 0.693750941524854, "grad/layer_27/attn": 0.009194931946694851, "grad/layer_27/mlp": 0.00726911798119545, "grad/layer_27/attn_mlp_ratio": 1.2649308821219052} {"step": 4750, "timestamp": 1778330850.4170299, "train/loss": 2.346107006072998, "train/z_loss": 0.0013493626145645976, "train/perplexity": 10.44482881897112, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020468.9584294057, "perf/iters_per_sec": 0.9634346763751057, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379530906677246, "data/tokens_consumed": 9963569152, "data/tokens_consumed_B": 9.963569152, "train/loss_slope": -1.2918893675027278e-05} {"step": 4760, "timestamp": 1778330860.796669, "train/loss": 2.3580590009689333, "train/z_loss": 0.001344639656599611, "train/perplexity": 10.570414363609038, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021676.3942629034, "perf/iters_per_sec": 0.9640104266466634, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373331785202027, "data/tokens_consumed": 9984540672, "data/tokens_consumed_B": 9.984540672, "train/loss_slope": -1.1787746863217802e-05} {"step": 4770, "timestamp": 1778330871.1737788, "train/loss": 2.3412325620651244, "train/z_loss": 0.001353108766488731, "train/perplexity": 10.394039969984272, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022342.7011094627, "perf/iters_per_sec": 0.964328146509868, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369914054870606, "data/tokens_consumed": 10005512192, "data/tokens_consumed_B": 10.005512192, "train/loss_slope": -1.380960834730932e-05} {"step": 4780, "timestamp": 1778330881.5575495, "train/loss": 2.3635746717453, "train/z_loss": 0.001347937365062535, "train/perplexity": 10.628878375109355, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020614.0475039594, "perf/iters_per_sec": 0.9635038602371022, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378785610198975, "data/tokens_consumed": 10026483712, "data/tokens_consumed_B": 10.026483712, "train/loss_slope": -1.436985552650722e-05} {"step": 4790, "timestamp": 1778330891.9355428, "train/loss": 2.381602907180786, "train/z_loss": 0.0013571086921729148, "train/perplexity": 10.822236008319166, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021973.9121248305, "perf/iters_per_sec": 0.9641522942184594, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371805429458618, "data/tokens_consumed": 10047455232, "data/tokens_consumed_B": 10.047455232, "train/loss_slope": -1.2179321431555673e-05} {"step": 4800, "timestamp": 1778330902.3041995, "grad/layer_0/attn": 0.002755251480266452, "grad/layer_0/mlp": 0.0030717782210558653, "grad/layer_0/attn_mlp_ratio": 0.8969564832788411, "grad/layer_4/attn": 0.0020274287089705467, "grad/layer_4/mlp": 0.0025687734596431255, "grad/layer_4/attn_mlp_ratio": 0.7892594118931267, "grad/layer_8/attn": 0.004106312524527311, "grad/layer_8/mlp": 0.0033684226218611, "grad/layer_8/attn_mlp_ratio": 1.2190609266103385, "grad/layer_12/attn": 0.008465616032481194, "grad/layer_12/mlp": 0.006764654535800219, "grad/layer_12/attn_mlp_ratio": 1.251448372202044, "grad/layer_16/attn": 0.003760125720873475, "grad/layer_16/mlp": 0.004323005210608244, "grad/layer_16/attn_mlp_ratio": 0.8697943793051756, "grad/layer_20/attn": 0.0038795815780758858, "grad/layer_20/mlp": 0.005355683155357838, "grad/layer_20/attn_mlp_ratio": 0.7243859267059425, "grad/layer_24/attn": 0.008585228584706783, "grad/layer_24/mlp": 0.008505337871611118, "grad/layer_24/attn_mlp_ratio": 1.0093929968876396, "grad/layer_27/attn": 0.006626148242503405, "grad/layer_27/mlp": 0.007734449580311775, "grad/layer_27/attn_mlp_ratio": 0.8567058441624389} {"step": 4800, "timestamp": 1778330902.89739, "eos/sharpness": 36.38699054718017, "eos/L0_probe": 2.3538222312927246, "eos/L_plus": 2.564699649810791, "eos/L_minus": 2.50681471824646, "eos/grad_norm": 0.12522579729557037, "eos/embed_grad_frac": 0.1506856381893158, "eos/time_s": 0.5904660224914551} {"step": 4800, "timestamp": 1778330902.9165738, "train/loss": 2.375101590156555, "train/z_loss": 0.0013417441048659384, "train/perplexity": 10.75210543866917, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910564.92179256, "perf/iters_per_sec": 0.9110283478701401, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0976606845855712, "data/tokens_consumed": 10068426752, "data/tokens_consumed_B": 10.068426752, "train/loss_slope": -1.1728718571930432e-05} {"step": 4800, "timestamp": 1778330904.280539, "geo/rankme_last": 426.7742919921875, "geo/layer_0/stable_rank_q_proj": 20.687807083129883, "geo/layer_0/stable_rank_k_proj": 17.242401123046875, "geo/layer_0/stable_rank_o_proj": 45.56449890136719, "geo/layer_0/stable_rank_gate_proj": 129.79330444335938, "geo/layer_0/stable_rank_down_proj": 56.564857482910156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06327321380376816, "geo/layer_0/attn_entropy_mean": 6.247496604919434, "geo/layer_0/attn_entropy_std": 0.42516666650772095, "geo/layer_7/stable_rank_q_proj": 42.564697265625, "geo/layer_7/stable_rank_k_proj": 39.53318786621094, "geo/layer_7/stable_rank_o_proj": 90.13721466064453, "geo/layer_7/stable_rank_gate_proj": 79.59343719482422, "geo/layer_7/stable_rank_down_proj": 142.7665252685547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.418340802192688, "geo/layer_7/attn_entropy_mean": 4.744450569152832, "geo/layer_7/attn_entropy_std": 0.7742258310317993, "geo/layer_14/stable_rank_q_proj": 51.840763092041016, "geo/layer_14/stable_rank_k_proj": 41.64136505126953, "geo/layer_14/stable_rank_o_proj": 42.705745697021484, "geo/layer_14/stable_rank_gate_proj": 71.99214935302734, "geo/layer_14/stable_rank_down_proj": 127.83482360839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3788193464279175, "geo/layer_14/attn_entropy_mean": 5.504075527191162, "geo/layer_14/attn_entropy_std": 0.4484582245349884, "geo/layer_21/stable_rank_q_proj": 39.59480667114258, "geo/layer_21/stable_rank_k_proj": 29.133819580078125, "geo/layer_21/stable_rank_o_proj": 66.3444595336914, "geo/layer_21/stable_rank_gate_proj": 62.49088668823242, "geo/layer_21/stable_rank_down_proj": 49.70376968383789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14092764258384705, "geo/layer_21/attn_entropy_mean": 5.867876052856445, "geo/layer_21/attn_entropy_std": 0.3252081871032715, "geo/layer_27/stable_rank_q_proj": 43.47134780883789, "geo/layer_27/stable_rank_k_proj": 30.704620361328125, "geo/layer_27/stable_rank_o_proj": 109.44603729248047, "geo/layer_27/stable_rank_gate_proj": 73.43118286132812, "geo/layer_27/stable_rank_down_proj": 127.73640441894531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10867095738649368, "geo/layer_27/attn_entropy_mean": 4.293252944946289, "geo/layer_27/attn_entropy_std": 0.6498997211456299, "attnres/final_alpha/block_0": 0.2513568699359894, "attnres/block_norm/0": 1.777951955795288, "attnres/final_alpha/block_1": 0.004046626389026642, "attnres/block_norm/1": 49736.37109375, "attnres/final_alpha/block_2": 0.008619258180260658, "attnres/block_norm/2": 29713.453125, "attnres/final_alpha/block_3": 0.010502252727746964, "attnres/block_norm/3": 68855.9453125, "attnres/final_alpha/block_4": 0.011949775740504265, "attnres/block_norm/4": 16782.0390625, "attnres/final_alpha/block_5": 0.6094921827316284, "attnres/block_norm/5": 7070.77734375, "attnres/final_alpha/block_6": 0.10403303056955338, "attnres/block_norm/6": 45815.3359375, "geo/tier1_time_s": 1.359926700592041, "geo/step": 4800.0, "geo/rankme_slope": 0.001424266054077881} {"step": 4810, "timestamp": 1778330915.6582644, "train/loss": 2.376316475868225, "train/z_loss": 0.0013656834256835283, "train/perplexity": 10.765175955921688, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1646406.9022816739, "perf/iters_per_sec": 0.7850679885300035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2737750291824341, "data/tokens_consumed": 10089398272, "data/tokens_consumed_B": 10.089398272, "train/loss_slope": -8.666223139152004e-06} {"step": 4820, "timestamp": 1778330926.046671, "train/loss": 2.3872600555419923, "train/z_loss": 0.001350509945768863, "train/perplexity": 10.883632503819241, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020216.1002510842, "perf/iters_per_sec": 0.9633141041999265, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038083004951477, "data/tokens_consumed": 10110369792, "data/tokens_consumed_B": 10.110369792, "train/loss_slope": -7.9688932171034e-06} {"step": 4830, "timestamp": 1778330936.4286304, "train/loss": 2.3644086837768556, "train/z_loss": 0.0013555715209804474, "train/perplexity": 10.637746685180739, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020965.90308799, "perf/iters_per_sec": 0.9636716380538892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376978635787963, "data/tokens_consumed": 10131341312, "data/tokens_consumed_B": 10.131341312, "train/loss_slope": -1.0966062831907286e-05} {"step": 4840, "timestamp": 1778330946.8172045, "train/loss": 2.3629801750183104, "train/z_loss": 0.0013642636593431234, "train/perplexity": 10.622561419594518, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020134.0706747775, "perf/iters_per_sec": 0.9632749894498718, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381251573562622, "data/tokens_consumed": 10152312832, "data/tokens_consumed_B": 10.152312832, "train/loss_slope": -1.241045460270365e-05} {"step": 4850, "timestamp": 1778330957.190624, "grad/layer_0/attn": 0.003661227645352483, "grad/layer_0/mlp": 0.0038623639848083258, "grad/layer_0/attn_mlp_ratio": 0.9479240084468048, "grad/layer_4/attn": 0.002527375938370824, "grad/layer_4/mlp": 0.0027293250896036625, "grad/layer_4/attn_mlp_ratio": 0.9260076256203957, "grad/layer_8/attn": 0.0034299802500754595, "grad/layer_8/mlp": 0.0037117013707756996, "grad/layer_8/attn_mlp_ratio": 0.9240991704428848, "grad/layer_12/attn": 0.005913134198635817, "grad/layer_12/mlp": 0.007658606860786676, "grad/layer_12/attn_mlp_ratio": 0.7720900457370424, "grad/layer_16/attn": 0.004238564986735582, "grad/layer_16/mlp": 0.005742733366787434, "grad/layer_16/attn_mlp_ratio": 0.7380744746816007, "grad/layer_20/attn": 0.003941545728594065, "grad/layer_20/mlp": 0.007912806235253811, "grad/layer_20/attn_mlp_ratio": 0.49812235528037047, "grad/layer_24/attn": 0.02167205512523651, "grad/layer_24/mlp": 0.01622978411614895, "grad/layer_24/attn_mlp_ratio": 1.3353261409151944, "grad/layer_27/attn": 0.0078811626881361, "grad/layer_27/mlp": 0.016926119104027748, "grad/layer_27/attn_mlp_ratio": 0.4656213626488401} {"step": 4850, "timestamp": 1778330957.2060864, "train/loss": 2.356445789337158, "train/z_loss": 0.001358537352643907, "train/perplexity": 10.553375795308437, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019968.7326587024, "perf/iters_per_sec": 0.9631961501401436, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038210129737854, "data/tokens_consumed": 10173284352, "data/tokens_consumed_B": 10.173284352, "train/loss_slope": -1.244721820395239e-05} {"step": 4860, "timestamp": 1778330967.581953, "train/loss": 2.415091133117676, "train/z_loss": 0.0013442018534988165, "train/perplexity": 11.19079016267977, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022104.8068201132, "perf/iters_per_sec": 0.9642147096729818, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037113404273987, "data/tokens_consumed": 10194255872, "data/tokens_consumed_B": 10.194255872, "train/loss_slope": -9.81492279457898e-06} {"step": 4870, "timestamp": 1778330977.9774778, "train/loss": 2.3680227756500245, "train/z_loss": 0.0013510373537428677, "train/perplexity": 10.676262036110012, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020359.4824195097, "perf/iters_per_sec": 0.9633824741456555, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380093336105347, "data/tokens_consumed": 10215227392, "data/tokens_consumed_B": 10.215227392, "train/loss_slope": -9.117549194647254e-06} {"step": 4875, "timestamp": 1778330983.7435977, "eos/sharpness": 51.51479244232177, "eos/L0_probe": 2.351132392883301, "eos/L_plus": 2.651033639907837, "eos/L_minus": 2.5663790702819824, "eos/grad_norm": 0.16365200281143188, "eos/embed_grad_frac": 0.09161081910133362, "eos/time_s": 0.5893604755401611} {"step": 4875, "timestamp": 1778330985.118374, "geo/rankme_last": 427.2397766113281, "geo/layer_0/stable_rank_q_proj": 20.69771385192871, "geo/layer_0/stable_rank_k_proj": 17.215314865112305, "geo/layer_0/stable_rank_o_proj": 45.53461456298828, "geo/layer_0/stable_rank_gate_proj": 129.7648162841797, "geo/layer_0/stable_rank_down_proj": 56.468963623046875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06605061888694763, "geo/layer_0/attn_entropy_mean": 6.252255439758301, "geo/layer_0/attn_entropy_std": 0.42649948596954346, "geo/layer_7/stable_rank_q_proj": 42.484378814697266, "geo/layer_7/stable_rank_k_proj": 39.40910339355469, "geo/layer_7/stable_rank_o_proj": 90.13545227050781, "geo/layer_7/stable_rank_gate_proj": 79.44412994384766, "geo/layer_7/stable_rank_down_proj": 142.96981811523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4208787679672241, "geo/layer_7/attn_entropy_mean": 4.719141006469727, "geo/layer_7/attn_entropy_std": 0.7601866722106934, "geo/layer_14/stable_rank_q_proj": 51.83879470825195, "geo/layer_14/stable_rank_k_proj": 41.659332275390625, "geo/layer_14/stable_rank_o_proj": 42.64857864379883, "geo/layer_14/stable_rank_gate_proj": 72.04513549804688, "geo/layer_14/stable_rank_down_proj": 127.53221893310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3759433925151825, "geo/layer_14/attn_entropy_mean": 5.519940376281738, "geo/layer_14/attn_entropy_std": 0.4386785328388214, "geo/layer_21/stable_rank_q_proj": 39.549842834472656, "geo/layer_21/stable_rank_k_proj": 29.12789535522461, "geo/layer_21/stable_rank_o_proj": 66.36219787597656, "geo/layer_21/stable_rank_gate_proj": 62.41746139526367, "geo/layer_21/stable_rank_down_proj": 49.68741989135742, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13518790900707245, "geo/layer_21/attn_entropy_mean": 5.88082218170166, "geo/layer_21/attn_entropy_std": 0.32108256220817566, "geo/layer_27/stable_rank_q_proj": 43.43391036987305, "geo/layer_27/stable_rank_k_proj": 30.699230194091797, "geo/layer_27/stable_rank_o_proj": 109.2016830444336, "geo/layer_27/stable_rank_gate_proj": 73.5017318725586, "geo/layer_27/stable_rank_down_proj": 127.57489013671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1078910306096077, "geo/layer_27/attn_entropy_mean": 4.314493656158447, "geo/layer_27/attn_entropy_std": 0.631510317325592, "attnres/final_alpha/block_0": 0.2506161332130432, "attnres/block_norm/0": 1.7777998447418213, "attnres/final_alpha/block_1": 0.004030454438179731, "attnres/block_norm/1": 49518.7421875, "attnres/final_alpha/block_2": 0.008598508313298225, "attnres/block_norm/2": 29801.40234375, "attnres/final_alpha/block_3": 0.010568584315478802, "attnres/block_norm/3": 69422.9765625, "attnres/final_alpha/block_4": 0.01199557725340128, "attnres/block_norm/4": 16802.646484375, "attnres/final_alpha/block_5": 0.6129108667373657, "attnres/block_norm/5": 7021.3974609375, "attnres/final_alpha/block_6": 0.1012798547744751, "attnres/block_norm/6": 46132.8671875, "geo/tier1_time_s": 1.355677604675293, "geo/step": 4875.0, "geo/rankme_slope": 0.0013936153367597038} {"step": 4880, "timestamp": 1778330990.3078227, "train/loss": 2.3634002923965456, "train/z_loss": 0.0013325309730134905, "train/perplexity": 10.627025079813222, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701729.4197984335, "perf/iters_per_sec": 0.8114478205673378, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2323651313781738, "data/tokens_consumed": 10236198912, "data/tokens_consumed_B": 10.236198912, "train/loss_slope": -8.421094475513532e-06} {"step": 4890, "timestamp": 1778331000.684788, "train/loss": 2.3461528539657595, "train/z_loss": 0.0013546413858421147, "train/perplexity": 10.445307703340559, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021864.0405992258, "perf/iters_per_sec": 0.9640999033924226, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372369050979615, "data/tokens_consumed": 10257170432, "data/tokens_consumed_B": 10.257170432, "train/loss_slope": -1.139479991089929e-05} {"step": 4900, "timestamp": 1778331011.0556636, "grad/layer_0/attn": 0.005079647526144981, "grad/layer_0/mlp": 0.005070625804364681, "grad/layer_0/attn_mlp_ratio": 1.0017791929340587, "grad/layer_4/attn": 0.005919570568948984, "grad/layer_4/mlp": 0.003525056876242161, "grad/layer_4/attn_mlp_ratio": 1.6792836566458742, "grad/layer_8/attn": 0.007899472489953041, "grad/layer_8/mlp": 0.003952063154429197, "grad/layer_8/attn_mlp_ratio": 1.9988223824858733, "grad/layer_12/attn": 0.007992863655090332, "grad/layer_12/mlp": 0.00842686090618372, "grad/layer_12/attn_mlp_ratio": 0.9484983375452716, "grad/layer_16/attn": 0.005704378709197044, "grad/layer_16/mlp": 0.005728031974285841, "grad/layer_16/attn_mlp_ratio": 0.9958705948601474, "grad/layer_20/attn": 0.018248306587338448, "grad/layer_20/mlp": 0.007324965670704842, "grad/layer_20/attn_mlp_ratio": 2.4912480356317777, "grad/layer_24/attn": 0.008413841016590595, "grad/layer_24/mlp": 0.011241055093705654, "grad/layer_24/attn_mlp_ratio": 0.7484921007506362, "grad/layer_27/attn": 0.005551094189286232, "grad/layer_27/mlp": 0.00928565114736557, "grad/layer_27/attn_mlp_ratio": 0.5978142018699154} {"step": 4900, "timestamp": 1778331011.0712035, "train/loss": 2.3796884059906005, "train/z_loss": 0.0013611436588689685, "train/perplexity": 10.801536645400715, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020335.67678646, "perf/iters_per_sec": 0.9633711227352428, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380215644836426, "data/tokens_consumed": 10278141952, "data/tokens_consumed_B": 10.278141952, "train/loss_slope": -1.119994636487575e-05} {"step": 4910, "timestamp": 1778331021.4498744, "train/loss": 2.3675546407699586, "train/z_loss": 0.0013474601437337697, "train/perplexity": 10.671265275132491, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021873.289053336, "perf/iters_per_sec": 0.9641043133989983, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372321605682373, "data/tokens_consumed": 10299113472, "data/tokens_consumed_B": 10.299113472, "train/loss_slope": -8.880480595476243e-06} {"step": 4920, "timestamp": 1778331031.8297515, "train/loss": 2.3607970476150513, "train/z_loss": 0.0013404622906818986, "train/perplexity": 10.599396310059808, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021358.8039179293, "perf/iters_per_sec": 0.9638589877690932, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374961614608764, "data/tokens_consumed": 10320084992, "data/tokens_consumed_B": 10.320084992, "train/loss_slope": -1.0221273191619704e-05} {"step": 4930, "timestamp": 1778331042.209694, "train/loss": 2.345916485786438, "train/z_loss": 0.0013552267802879215, "train/perplexity": 10.442839056742512, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021664.220295267, "perf/iters_per_sec": 0.9640046216465316, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373394250869752, "data/tokens_consumed": 10341056512, "data/tokens_consumed_B": 10.341056512, "train/loss_slope": -1.1163324126602112e-05} {"step": 4940, "timestamp": 1778331052.5892477, "train/loss": 2.379658651351929, "train/z_loss": 0.0013430540333501995, "train/perplexity": 10.801215254362193, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021316.9522465933, "perf/iters_per_sec": 0.9638390313370673, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375176429748536, "data/tokens_consumed": 10362028032, "data/tokens_consumed_B": 10.362028032, "train/loss_slope": -1.0482279631313953e-05} {"step": 4950, "timestamp": 1778331062.9522903, "grad/layer_0/attn": 0.00264913821592927, "grad/layer_0/mlp": 0.003044887911528349, "grad/layer_0/attn_mlp_ratio": 0.8700281277666968, "grad/layer_4/attn": 0.0026374515146017075, "grad/layer_4/mlp": 0.0025940376799553633, "grad/layer_4/attn_mlp_ratio": 1.0167359685282191, "grad/layer_8/attn": 0.004220211878418922, "grad/layer_8/mlp": 0.0036059864796698093, "grad/layer_8/attn_mlp_ratio": 1.1703348820575357, "grad/layer_12/attn": 0.0054274084977805614, "grad/layer_12/mlp": 0.0070551601238548756, "grad/layer_12/attn_mlp_ratio": 0.7692821035345778, "grad/layer_16/attn": 0.004630186129361391, "grad/layer_16/mlp": 0.00418085278943181, "grad/layer_16/attn_mlp_ratio": 1.1074740613489145, "grad/layer_20/attn": 0.003814429510384798, "grad/layer_20/mlp": 0.006268290337175131, "grad/layer_20/attn_mlp_ratio": 0.6085278830991453, "grad/layer_24/attn": 0.013470352627336979, "grad/layer_24/mlp": 0.011429262347519398, "grad/layer_24/attn_mlp_ratio": 1.1785845927670144, "grad/layer_27/attn": 0.014798068441450596, "grad/layer_27/mlp": 0.010823340155184269, "grad/layer_27/attn_mlp_ratio": 1.3672367395418872} {"step": 4950, "timestamp": 1778331063.5446265, "eos/sharpness": 66.9286012649536, "eos/L0_probe": 2.3478894233703613, "eos/L_plus": 2.629371166229248, "eos/L_minus": 2.7356936931610107, "eos/grad_norm": 0.21743282675743103, "eos/embed_grad_frac": 0.0465080700814724, "eos/time_s": 0.5895695686340332} {"step": 4950, "timestamp": 1778331063.5657303, "train/loss": 2.3337062120437624, "train/z_loss": 0.0013447299017570913, "train/perplexity": 10.31610443996711, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911537.812910063, "perf/iters_per_sec": 0.9114922585058512, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0971020221710206, "data/tokens_consumed": 10382999552, "data/tokens_consumed_B": 10.382999552, "train/loss_slope": -1.1897518511044512e-05} {"step": 4950, "timestamp": 1778331064.926653, "geo/rankme_last": 426.2264404296875, "geo/layer_0/stable_rank_q_proj": 20.672313690185547, "geo/layer_0/stable_rank_k_proj": 17.174739837646484, "geo/layer_0/stable_rank_o_proj": 45.51285171508789, "geo/layer_0/stable_rank_gate_proj": 129.632080078125, "geo/layer_0/stable_rank_down_proj": 56.4533576965332, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06386811286211014, "geo/layer_0/attn_entropy_mean": 6.242234230041504, "geo/layer_0/attn_entropy_std": 0.4319290518760681, "geo/layer_7/stable_rank_q_proj": 42.489253997802734, "geo/layer_7/stable_rank_k_proj": 39.394630432128906, "geo/layer_7/stable_rank_o_proj": 90.1033706665039, "geo/layer_7/stable_rank_gate_proj": 79.31881713867188, "geo/layer_7/stable_rank_down_proj": 143.21177673339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41364359855651855, "geo/layer_7/attn_entropy_mean": 4.736504554748535, "geo/layer_7/attn_entropy_std": 0.776334822177887, "geo/layer_14/stable_rank_q_proj": 51.73284912109375, "geo/layer_14/stable_rank_k_proj": 41.74523162841797, "geo/layer_14/stable_rank_o_proj": 42.633140563964844, "geo/layer_14/stable_rank_gate_proj": 72.03182220458984, "geo/layer_14/stable_rank_down_proj": 127.6769027709961, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3848242163658142, "geo/layer_14/attn_entropy_mean": 5.531711578369141, "geo/layer_14/attn_entropy_std": 0.43392106890678406, "geo/layer_21/stable_rank_q_proj": 39.52375030517578, "geo/layer_21/stable_rank_k_proj": 29.052568435668945, "geo/layer_21/stable_rank_o_proj": 66.38544464111328, "geo/layer_21/stable_rank_gate_proj": 62.39361572265625, "geo/layer_21/stable_rank_down_proj": 49.726234436035156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1380479633808136, "geo/layer_21/attn_entropy_mean": 5.875922203063965, "geo/layer_21/attn_entropy_std": 0.3129226565361023, "geo/layer_27/stable_rank_q_proj": 43.4974250793457, "geo/layer_27/stable_rank_k_proj": 30.662832260131836, "geo/layer_27/stable_rank_o_proj": 109.32535552978516, "geo/layer_27/stable_rank_gate_proj": 73.51853942871094, "geo/layer_27/stable_rank_down_proj": 127.6770248413086, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.102937251329422, "geo/layer_27/attn_entropy_mean": 4.300116539001465, "geo/layer_27/attn_entropy_std": 0.6362943053245544, "attnres/final_alpha/block_0": 0.251831591129303, "attnres/block_norm/0": 1.7779135704040527, "attnres/final_alpha/block_1": 0.004034962970763445, "attnres/block_norm/1": 49716.015625, "attnres/final_alpha/block_2": 0.008745728060603142, "attnres/block_norm/2": 29725.51171875, "attnres/final_alpha/block_3": 0.010499070398509502, "attnres/block_norm/3": 69896.0, "attnres/final_alpha/block_4": 0.012005537748336792, "attnres/block_norm/4": 16768.23828125, "attnres/final_alpha/block_5": 0.6095658540725708, "attnres/block_norm/5": 7040.7763671875, "attnres/final_alpha/block_6": 0.1033172458410263, "attnres/block_norm/6": 46008.8984375, "geo/tier1_time_s": 1.3566033840179443, "geo/step": 4950.0, "geo/rankme_slope": 0.001337031218737495} {"step": 4960, "timestamp": 1778331075.3012528, "train/loss": 2.349220371246338, "train/z_loss": 0.0013463226263411344, "train/perplexity": 10.477398058918373, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787641.9324026082, "perf/iters_per_sec": 0.8524140989316026, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1731387376785278, "data/tokens_consumed": 10403971072, "data/tokens_consumed_B": 10.403971072, "train/loss_slope": -1.260875366558872e-05} {"step": 4970, "timestamp": 1778331085.6854718, "train/loss": 2.4107226610183714, "train/z_loss": 0.0013616627547889948, "train/perplexity": 11.142010132759436, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020713.5704202272, "perf/iters_per_sec": 0.9635513164616715, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037827444076538, "data/tokens_consumed": 10424942592, "data/tokens_consumed_B": 10.424942592, "train/loss_slope": -1.0714297090033096e-05} {"step": 4980, "timestamp": 1778331096.0599968, "train/loss": 2.3615350246429445, "train/z_loss": 0.0013536958955228328, "train/perplexity": 10.607221308025409, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022432.2107155262, "perf/iters_per_sec": 0.9643708280160552, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369455099105835, "data/tokens_consumed": 10445914112, "data/tokens_consumed_B": 10.445914112, "train/loss_slope": -9.294055039697371e-06} {"step": 4990, "timestamp": 1778331106.4347878, "train/loss": 2.3213791608810426, "train/z_loss": 0.001338763500098139, "train/perplexity": 10.189717879962364, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022969.5743644352, "perf/iters_per_sec": 0.9646270629713226, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366700649261475, "data/tokens_consumed": 10466885632, "data/tokens_consumed_B": 10.466885632, "train/loss_slope": -1.0059041072755014e-05} {"step": 5000, "timestamp": 1778331116.7997065, "grad/layer_0/attn": 0.0037771952338516712, "grad/layer_0/mlp": 0.0036937554832547903, "grad/layer_0/attn_mlp_ratio": 1.0225893805684234, "grad/layer_4/attn": 0.002214839681982994, "grad/layer_4/mlp": 0.0026179503183811903, "grad/layer_4/attn_mlp_ratio": 0.8460204847395622, "grad/layer_8/attn": 0.0045262714847922325, "grad/layer_8/mlp": 0.0037407842464745045, "grad/layer_8/attn_mlp_ratio": 1.2099792625196395, "grad/layer_12/attn": 0.005408908240497112, "grad/layer_12/mlp": 0.007310494780540466, "grad/layer_12/attn_mlp_ratio": 0.7398826384373627, "grad/layer_16/attn": 0.006275364197790623, "grad/layer_16/mlp": 0.004555024206638336, "grad/layer_16/attn_mlp_ratio": 1.3776796292052982, "grad/layer_20/attn": 0.0038579925894737244, "grad/layer_20/mlp": 0.005875548347830772, "grad/layer_20/attn_mlp_ratio": 0.6566182925269008, "grad/layer_24/attn": 0.005155015271157026, "grad/layer_24/mlp": 0.007502452470362186, "grad/layer_24/attn_mlp_ratio": 0.687110677849732, "grad/layer_27/attn": 0.008977135643362999, "grad/layer_27/mlp": 0.006525006145238876, "grad/layer_27/attn_mlp_ratio": 1.375804912050985} {"step": 5000, "timestamp": 1778331116.8151484, "train/loss": 2.320221781730652, "train/z_loss": 0.0013579391641542315, "train/perplexity": 10.177931335006125, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021185.8344239371, "perf/iters_per_sec": 0.9637765094871221, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037584948539734, "data/tokens_consumed": 10487857152, "data/tokens_consumed_B": 10.487857152, "train/loss_slope": -1.2201588596626676e-05} {"step": 5000, "timestamp": 1778331123.8550193, "geo/ww_alpha_mean": 7.499242113038008, "geo/ww_alpha_std": 4.388676042104682, "geo/ww_alpha_min": 1.362845678416678, "geo/ww_alpha_max": 27.034988299936657, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.891988141115249, "geo/ww_alpha_by_type/k_proj": 4.552275760503137, "geo/ww_alpha_by_type/v_proj": 8.602049419284505, "geo/ww_alpha_by_type/o_proj": 8.782628337113438, "geo/ww_alpha_by_type/gate_proj": 7.886480859669341, "geo/ww_alpha_by_type/up_proj": 10.978488133609389, "geo/ww_alpha_by_type/down_proj": 7.8898211817926756, "geo/twonn_id/layer_0": 0.7107530236244202, "geo/twonn_id/layer_7": 3.3229851722717285, "geo/twonn_id/layer_14": 5.570657730102539, "geo/twonn_id/layer_21": 6.866250038146973, "geo/twonn_id/layer_27": 6.345386028289795, "geo/tier2_time_s": 7.0330119132995605} {"step": 5000, "timestamp": 1778331124.6326165, "eoc/jacobian_sigma/layer_0/attn": 1591.3271484375, "eoc/jacobian_sigma/layer_0/mlp": 10513.4501953125, "eoc/jacobian_sigma/layer_0": 10513.4501953125, "eoc/jacobian_sigma/layer_7/attn": 1.127973198890686, "eoc/jacobian_sigma/layer_7/mlp": 1.7679452896118164, "eoc/jacobian_sigma/layer_7": 1.7679452896118164, "eoc/jacobian_sigma/layer_14/attn": 1.9914573431015015, "eoc/jacobian_sigma/layer_14/mlp": 14.833356857299805, "eoc/jacobian_sigma/layer_14": 14.833356857299805, "eoc/jacobian_sigma/layer_21/attn": 1.080452561378479, "eoc/jacobian_sigma/layer_21/mlp": 4.8790717124938965, "eoc/jacobian_sigma/layer_21": 4.8790717124938965, "eoc/jacobian_sigma/layer_27/attn": 3.5367183685302734, "eoc/jacobian_sigma/layer_27/mlp": 29.975847244262695, "eoc/jacobian_sigma/layer_27": 29.975847244262695, "eoc/layer0_sigma": 10513.4501953125, "eoc/sigma_max": 29.975847244262695, "eoc/sigma_min": 1.7679452896118164, "eoc/sigma_mean": 12.864055275917053, "eoc/time_s": 0.7702555656433105} {"step": 5010, "timestamp": 1778331135.0281308, "train/loss": 2.33466956615448, "train/z_loss": 0.0013559683109633624, "train/perplexity": 10.326047290059744, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1151802.1194254793, "perf/iters_per_sec": 0.549222049439182, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8207571983337403, "data/tokens_consumed": 10508828672, "data/tokens_consumed_B": 10.508828672, "train/loss_slope": -1.3533459087409587e-05} {"step": 5020, "timestamp": 1778331145.4067664, "train/loss": 2.3634141206741335, "train/z_loss": 0.001355256186798215, "train/perplexity": 10.627172034282022, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021839.920657184, "perf/iters_per_sec": 0.9640884021078033, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372492790222168, "data/tokens_consumed": 10529800192, "data/tokens_consumed_B": 10.529800192, "train/loss_slope": -1.5109515583554482e-05} {"step": 5025, "timestamp": 1778331151.1910453, "eos/sharpness": 2.8198003768920894, "eos/L0_probe": 2.352875232696533, "eos/L_plus": 2.3702988624572754, "eos/L_minus": 2.363649606704712, "eos/grad_norm": 0.07796242833137512, "eos/embed_grad_frac": 0.36376485228538513, "eos/time_s": 0.6090717315673828} {"step": 5025, "timestamp": 1778331152.5895152, "geo/rankme_last": 426.7684020996094, "geo/layer_0/stable_rank_q_proj": 20.691041946411133, "geo/layer_0/stable_rank_k_proj": 17.17072296142578, "geo/layer_0/stable_rank_o_proj": 45.442840576171875, "geo/layer_0/stable_rank_gate_proj": 129.40231323242188, "geo/layer_0/stable_rank_down_proj": 56.46393966674805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06747911125421524, "geo/layer_0/attn_entropy_mean": 6.247854232788086, "geo/layer_0/attn_entropy_std": 0.42529812455177307, "geo/layer_7/stable_rank_q_proj": 42.43916320800781, "geo/layer_7/stable_rank_k_proj": 39.24744415283203, "geo/layer_7/stable_rank_o_proj": 90.19204711914062, "geo/layer_7/stable_rank_gate_proj": 79.4549331665039, "geo/layer_7/stable_rank_down_proj": 143.33038330078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4108956456184387, "geo/layer_7/attn_entropy_mean": 4.713747024536133, "geo/layer_7/attn_entropy_std": 0.7608352303504944, "geo/layer_14/stable_rank_q_proj": 51.63333511352539, "geo/layer_14/stable_rank_k_proj": 41.653961181640625, "geo/layer_14/stable_rank_o_proj": 42.65972900390625, "geo/layer_14/stable_rank_gate_proj": 72.02140808105469, "geo/layer_14/stable_rank_down_proj": 127.50371551513672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37652140855789185, "geo/layer_14/attn_entropy_mean": 5.537430763244629, "geo/layer_14/attn_entropy_std": 0.44742223620414734, "geo/layer_21/stable_rank_q_proj": 39.52861785888672, "geo/layer_21/stable_rank_k_proj": 29.148677825927734, "geo/layer_21/stable_rank_o_proj": 66.37397003173828, "geo/layer_21/stable_rank_gate_proj": 62.349761962890625, "geo/layer_21/stable_rank_down_proj": 49.71233367919922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13689368963241577, "geo/layer_21/attn_entropy_mean": 5.854410171508789, "geo/layer_21/attn_entropy_std": 0.3140423595905304, "geo/layer_27/stable_rank_q_proj": 43.49055099487305, "geo/layer_27/stable_rank_k_proj": 30.711484909057617, "geo/layer_27/stable_rank_o_proj": 109.31067657470703, "geo/layer_27/stable_rank_gate_proj": 73.42265319824219, "geo/layer_27/stable_rank_down_proj": 127.92866516113281, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10241547971963882, "geo/layer_27/attn_entropy_mean": 4.307686805725098, "geo/layer_27/attn_entropy_std": 0.6386936902999878, "attnres/final_alpha/block_0": 0.2506014406681061, "attnres/block_norm/0": 1.7778849601745605, "attnres/final_alpha/block_1": 0.004056630656123161, "attnres/block_norm/1": 49876.5859375, "attnres/final_alpha/block_2": 0.008577274158596992, "attnres/block_norm/2": 29630.015625, "attnres/final_alpha/block_3": 0.010420477949082851, "attnres/block_norm/3": 68926.65625, "attnres/final_alpha/block_4": 0.01187092438340187, "attnres/block_norm/4": 16835.48828125, "attnres/final_alpha/block_5": 0.6109598278999329, "attnres/block_norm/5": 7038.8212890625, "attnres/final_alpha/block_6": 0.10351341962814331, "attnres/block_norm/6": 45809.8515625, "geo/tier1_time_s": 1.3792061805725098, "geo/step": 5025.0, "geo/rankme_slope": 0.001299373694790416} {"step": 5030, "timestamp": 1778331157.7799346, "train/loss": 2.3833415269851685, "train/z_loss": 0.0013593323645181953, "train/perplexity": 10.841068128375566, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695565.449716414, "perf/iters_per_sec": 0.8085086105901785, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2368452072143554, "data/tokens_consumed": 10550771712, "data/tokens_consumed_B": 10.550771712, "train/loss_slope": -1.4138785819194677e-05} {"step": 5040, "timestamp": 1778331168.1604388, "train/loss": 2.379675602912903, "train/z_loss": 0.0013458642759360373, "train/perplexity": 10.801398353373074, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021640.244512917, "perf/iters_per_sec": 0.9639931891026101, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373517274856567, "data/tokens_consumed": 10571743232, "data/tokens_consumed_B": 10.571743232, "train/loss_slope": -1.2959951958139806e-05} {"step": 5050, "timestamp": 1778331178.5227418, "grad/layer_0/attn": 0.0033177845180034637, "grad/layer_0/mlp": 0.0035044695250689983, "grad/layer_0/attn_mlp_ratio": 0.9467294264073243, "grad/layer_4/attn": 0.005897806491702795, "grad/layer_4/mlp": 0.0026815752498805523, "grad/layer_4/attn_mlp_ratio": 2.1993812301285907, "grad/layer_8/attn": 0.007147431839257479, "grad/layer_8/mlp": 0.0035899921786040068, "grad/layer_8/attn_mlp_ratio": 1.9909323710403095, "grad/layer_12/attn": 0.00604880740866065, "grad/layer_12/mlp": 0.00801587849855423, "grad/layer_12/attn_mlp_ratio": 0.7546031709801135, "grad/layer_16/attn": 0.006083040032535791, "grad/layer_16/mlp": 0.005292840301990509, "grad/layer_16/attn_mlp_ratio": 1.1492959489668548, "grad/layer_20/attn": 0.005133680999279022, "grad/layer_20/mlp": 0.007840664125978947, "grad/layer_20/attn_mlp_ratio": 0.6547507776534145, "grad/layer_24/attn": 0.022101860493421555, "grad/layer_24/mlp": 0.016914810985326767, "grad/layer_24/attn_mlp_ratio": 1.3066572474223161, "grad/layer_27/attn": 0.005482223816215992, "grad/layer_27/mlp": 0.01660141907632351, "grad/layer_27/attn_mlp_ratio": 0.33022621487894177} {"step": 5050, "timestamp": 1778331178.5386152, "train/loss": 2.3598527908325195, "train/z_loss": 0.0013529677409678698, "train/perplexity": 10.589392482037825, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021617.5238021342, "perf/iters_per_sec": 0.9639823550234481, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037363386154175, "data/tokens_consumed": 10592714752, "data/tokens_consumed_B": 10.592714752, "train/loss_slope": -1.4075181536441253e-05} {"step": 5060, "timestamp": 1778331188.9119313, "train/loss": 2.3845788717269896, "train/z_loss": 0.001344498060643673, "train/perplexity": 10.854490569400843, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022597.998782315, "perf/iters_per_sec": 0.9644498819266868, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368605136871338, "data/tokens_consumed": 10613686272, "data/tokens_consumed_B": 10.613686272, "train/loss_slope": -1.2188174836885196e-05} {"step": 5070, "timestamp": 1778331199.2955883, "train/loss": 2.327450680732727, "train/z_loss": 0.0013508033822290598, "train/perplexity": 10.251773148621076, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021110.3205078053, "perf/iters_per_sec": 0.963740501645949, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376237154006958, "data/tokens_consumed": 10634657792, "data/tokens_consumed_B": 10.634657792, "train/loss_slope": -1.427604915356797e-05} {"step": 5080, "timestamp": 1778331209.6712062, "train/loss": 2.3760894298553468, "train/z_loss": 0.0013482934329658747, "train/perplexity": 10.762732043093795, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022218.981655224, "perf/iters_per_sec": 0.9642691524768944, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370548486709594, "data/tokens_consumed": 10655629312, "data/tokens_consumed_B": 10.655629312, "train/loss_slope": -1.547175440886979e-05} {"step": 5090, "timestamp": 1778331220.0523608, "train/loss": 2.369982051849365, "train/z_loss": 0.0013519443687982856, "train/perplexity": 10.697200287425934, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021113.8499378357, "perf/iters_per_sec": 0.9637421846093348, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376219034194947, "data/tokens_consumed": 10676600832, "data/tokens_consumed_B": 10.676600832, "train/loss_slope": -1.353981889048887e-05} {"step": 5100, "timestamp": 1778331230.415619, "grad/layer_0/attn": 0.003508572932332754, "grad/layer_0/mlp": 0.0037339446134865284, "grad/layer_0/attn_mlp_ratio": 0.9396424429264417, "grad/layer_4/attn": 0.002302864333614707, "grad/layer_4/mlp": 0.0027517424896359444, "grad/layer_4/attn_mlp_ratio": 0.8368749105705326, "grad/layer_8/attn": 0.011184916831552982, "grad/layer_8/mlp": 0.0035906508564949036, "grad/layer_8/attn_mlp_ratio": 3.1150108899672597, "grad/layer_12/attn": 0.007686536759138107, "grad/layer_12/mlp": 0.007721972186118364, "grad/layer_12/attn_mlp_ratio": 0.9954110781977348, "grad/layer_16/attn": 0.006758979521691799, "grad/layer_16/mlp": 0.0045793806202709675, "grad/layer_16/attn_mlp_ratio": 1.4759592911270025, "grad/layer_20/attn": 0.005315129645168781, "grad/layer_20/mlp": 0.007109433878213167, "grad/layer_20/attn_mlp_ratio": 0.747616429304636, "grad/layer_24/attn": 0.01914030872285366, "grad/layer_24/mlp": 0.016789134591817856, "grad/layer_24/attn_mlp_ratio": 1.140041406194784, "grad/layer_27/attn": 0.005315191578119993, "grad/layer_27/mlp": 0.018651511520147324, "grad/layer_27/attn_mlp_ratio": 0.2849737697602233} {"step": 5100, "timestamp": 1778331231.0160143, "eos/sharpness": 58.16116333007811, "eos/L0_probe": 2.349838972091675, "eos/L_plus": 2.643132448196411, "eos/L_minus": 2.6381571292877197, "eos/grad_norm": 0.25234395265579224, "eos/embed_grad_frac": 0.04886632412672043, "eos/time_s": 0.5975949764251709} {"step": 5100, "timestamp": 1778331231.0349653, "train/loss": 2.355446791648865, "train/z_loss": 0.001358849392272532, "train/perplexity": 10.542838261647486, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910732.8815226287, "perf/iters_per_sec": 0.9111084373105186, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.097564196586609, "data/tokens_consumed": 10697572352, "data/tokens_consumed_B": 10.697572352, "train/loss_slope": -1.269977129224607e-05} {"step": 5100, "timestamp": 1778331232.3961418, "geo/rankme_last": 427.1892395019531, "geo/layer_0/stable_rank_q_proj": 20.72665786743164, "geo/layer_0/stable_rank_k_proj": 17.199865341186523, "geo/layer_0/stable_rank_o_proj": 45.43068313598633, "geo/layer_0/stable_rank_gate_proj": 129.42086791992188, "geo/layer_0/stable_rank_down_proj": 56.45526123046875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07087869942188263, "geo/layer_0/attn_entropy_mean": 6.252745628356934, "geo/layer_0/attn_entropy_std": 0.42191967368125916, "geo/layer_7/stable_rank_q_proj": 42.52662658691406, "geo/layer_7/stable_rank_k_proj": 39.4621467590332, "geo/layer_7/stable_rank_o_proj": 90.16413116455078, "geo/layer_7/stable_rank_gate_proj": 79.4058837890625, "geo/layer_7/stable_rank_down_proj": 143.3752899169922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41763967275619507, "geo/layer_7/attn_entropy_mean": 4.693960189819336, "geo/layer_7/attn_entropy_std": 0.7770171761512756, "geo/layer_14/stable_rank_q_proj": 51.62983322143555, "geo/layer_14/stable_rank_k_proj": 41.74213790893555, "geo/layer_14/stable_rank_o_proj": 42.60393142700195, "geo/layer_14/stable_rank_gate_proj": 71.99441528320312, "geo/layer_14/stable_rank_down_proj": 127.66027069091797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36176663637161255, "geo/layer_14/attn_entropy_mean": 5.540071964263916, "geo/layer_14/attn_entropy_std": 0.41688427329063416, "geo/layer_21/stable_rank_q_proj": 39.537261962890625, "geo/layer_21/stable_rank_k_proj": 29.162010192871094, "geo/layer_21/stable_rank_o_proj": 66.32279205322266, "geo/layer_21/stable_rank_gate_proj": 62.288124084472656, "geo/layer_21/stable_rank_down_proj": 49.764732360839844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13355857133865356, "geo/layer_21/attn_entropy_mean": 5.853498458862305, "geo/layer_21/attn_entropy_std": 0.32349175214767456, "geo/layer_27/stable_rank_q_proj": 43.51234817504883, "geo/layer_27/stable_rank_k_proj": 30.688074111938477, "geo/layer_27/stable_rank_o_proj": 109.36029815673828, "geo/layer_27/stable_rank_gate_proj": 73.4325180053711, "geo/layer_27/stable_rank_down_proj": 127.8382797241211, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09750492870807648, "geo/layer_27/attn_entropy_mean": 4.320371627807617, "geo/layer_27/attn_entropy_std": 0.6574276089668274, "attnres/final_alpha/block_0": 0.25273749232292175, "attnres/block_norm/0": 1.7780206203460693, "attnres/final_alpha/block_1": 0.004147026687860489, "attnres/block_norm/1": 49635.234375, "attnres/final_alpha/block_2": 0.008710509166121483, "attnres/block_norm/2": 29691.37890625, "attnres/final_alpha/block_3": 0.010544326156377792, "attnres/block_norm/3": 69832.375, "attnres/final_alpha/block_4": 0.012106377631425858, "attnres/block_norm/4": 16784.5859375, "attnres/final_alpha/block_5": 0.6085947751998901, "attnres/block_norm/5": 7015.494140625, "attnres/final_alpha/block_6": 0.10315944254398346, "attnres/block_norm/6": 45921.76171875, "geo/tier1_time_s": 1.3574113845825195, "geo/step": 5100.0, "geo/rankme_slope": 0.001308541268069728} {"step": 5110, "timestamp": 1778331242.7688217, "train/loss": 2.3699763774871827, "train/z_loss": 0.0013524583308026194, "train/perplexity": 10.697139587809382, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787820.6962870085, "perf/iters_per_sec": 0.8524993401942293, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1730214357376099, "data/tokens_consumed": 10718543872, "data/tokens_consumed_B": 10.718543872, "train/loss_slope": -1.1877014298643558e-05} {"step": 5120, "timestamp": 1778331253.1501935, "train/loss": 2.417532205581665, "train/z_loss": 0.0013437791145406663, "train/perplexity": 11.21814106157763, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021408.8331065208, "perf/iters_per_sec": 0.9638828435452084, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374704837799071, "data/tokens_consumed": 10739515392, "data/tokens_consumed_B": 10.739515392, "train/loss_slope": -7.720111691840827e-06} {"step": 5130, "timestamp": 1778331263.527721, "train/loss": 2.3750891208648683, "train/z_loss": 0.001355963118840009, "train/perplexity": 10.751971368366092, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022045.2606477512, "perf/iters_per_sec": 0.9641863158453708, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371439456939697, "data/tokens_consumed": 10760486912, "data/tokens_consumed_B": 10.760486912, "train/loss_slope": -5.978208728427404e-06} {"step": 5140, "timestamp": 1778331273.89921, "train/loss": 2.366229462623596, "train/z_loss": 0.0013485630741342902, "train/perplexity": 10.6571333133474, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023098.7367560999, "perf/iters_per_sec": 0.9646886523991107, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366038799285888, "data/tokens_consumed": 10781458432, "data/tokens_consumed_B": 10.781458432, "train/loss_slope": -8.685326297255221e-06} {"step": 5150, "timestamp": 1778331284.2729468, "grad/layer_0/attn": 0.0030698117334395647, "grad/layer_0/mlp": 0.0033114526886492968, "grad/layer_0/attn_mlp_ratio": 0.9270286878200384, "grad/layer_4/attn": 0.0025981198996305466, "grad/layer_4/mlp": 0.002594494028016925, "grad/layer_4/attn_mlp_ratio": 1.0013974869221973, "grad/layer_8/attn": 0.004562488757073879, "grad/layer_8/mlp": 0.0033017611131072044, "grad/layer_8/attn_mlp_ratio": 1.3818348640603952, "grad/layer_12/attn": 0.004710420500487089, "grad/layer_12/mlp": 0.006342002656310797, "grad/layer_12/attn_mlp_ratio": 0.7427339093789965, "grad/layer_16/attn": 0.004418510943651199, "grad/layer_16/mlp": 0.004896289668977261, "grad/layer_16/attn_mlp_ratio": 0.9024202308545437, "grad/layer_20/attn": 0.0027494807727634907, "grad/layer_20/mlp": 0.0061236219480633736, "grad/layer_20/attn_mlp_ratio": 0.44899583141860105, "grad/layer_24/attn": 0.013323435559868813, "grad/layer_24/mlp": 0.01167351845651865, "grad/layer_24/attn_mlp_ratio": 1.1413384486743996, "grad/layer_27/attn": 0.005021148361265659, "grad/layer_27/mlp": 0.01273004338145256, "grad/layer_27/attn_mlp_ratio": 0.39443293093078435} {"step": 5150, "timestamp": 1778331284.2885363, "train/loss": 2.349870800971985, "train/z_loss": 0.0013556102639995515, "train/perplexity": 10.48421508682179, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019875.6375556586, "perf/iters_per_sec": 0.9631517589357655, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0382579803466796, "data/tokens_consumed": 10802429952, "data/tokens_consumed_B": 10.802429952, "train/loss_slope": -8.98529820137464e-06} {"step": 5160, "timestamp": 1778331294.6669471, "train/loss": 2.383851742744446, "train/z_loss": 0.0013548760092817248, "train/perplexity": 10.84660082349616, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021894.6677133087, "perf/iters_per_sec": 0.9641145075384658, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372211933135986, "data/tokens_consumed": 10823401472, "data/tokens_consumed_B": 10.823401472, "train/loss_slope": -7.5607269045614105e-06} {"step": 5170, "timestamp": 1778331305.045958, "train/loss": 2.359169340133667, "train/z_loss": 0.0013433994725346565, "train/perplexity": 10.582157626960552, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021717.377933841, "perf/iters_per_sec": 0.964029969183846, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373121500015259, "data/tokens_consumed": 10844372992, "data/tokens_consumed_B": 10.844372992, "train/loss_slope": -1.0195362364033268e-05} {"step": 5175, "timestamp": 1778331310.8217685, "eos/sharpness": 12.776947021484373, "eos/L0_probe": 2.3494625091552734, "eos/L_plus": 2.404175281524658, "eos/L_minus": 2.4225192070007324, "eos/grad_norm": 0.09144535660743713, "eos/embed_grad_frac": 0.2750701904296875, "eos/time_s": 0.5955758094787598} {"step": 5175, "timestamp": 1778331312.2020319, "geo/rankme_last": 427.384033203125, "geo/layer_0/stable_rank_q_proj": 20.71908950805664, "geo/layer_0/stable_rank_k_proj": 17.174663543701172, "geo/layer_0/stable_rank_o_proj": 45.35298156738281, "geo/layer_0/stable_rank_gate_proj": 129.28463745117188, "geo/layer_0/stable_rank_down_proj": 56.47938537597656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06802535057067871, "geo/layer_0/attn_entropy_mean": 6.249416351318359, "geo/layer_0/attn_entropy_std": 0.42780154943466187, "geo/layer_7/stable_rank_q_proj": 42.460845947265625, "geo/layer_7/stable_rank_k_proj": 39.360321044921875, "geo/layer_7/stable_rank_o_proj": 90.30597686767578, "geo/layer_7/stable_rank_gate_proj": 79.37173461914062, "geo/layer_7/stable_rank_down_proj": 143.70899963378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.417159765958786, "geo/layer_7/attn_entropy_mean": 4.70956563949585, "geo/layer_7/attn_entropy_std": 0.7710647583007812, "geo/layer_14/stable_rank_q_proj": 51.54206848144531, "geo/layer_14/stable_rank_k_proj": 41.66958236694336, "geo/layer_14/stable_rank_o_proj": 42.50982666015625, "geo/layer_14/stable_rank_gate_proj": 71.90333557128906, "geo/layer_14/stable_rank_down_proj": 127.32487487792969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37108850479125977, "geo/layer_14/attn_entropy_mean": 5.514100074768066, "geo/layer_14/attn_entropy_std": 0.42976951599121094, "geo/layer_21/stable_rank_q_proj": 39.48179244995117, "geo/layer_21/stable_rank_k_proj": 29.18086814880371, "geo/layer_21/stable_rank_o_proj": 66.19412231445312, "geo/layer_21/stable_rank_gate_proj": 62.29460525512695, "geo/layer_21/stable_rank_down_proj": 49.73123550415039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13514557480812073, "geo/layer_21/attn_entropy_mean": 5.893039226531982, "geo/layer_21/attn_entropy_std": 0.31659406423568726, "geo/layer_27/stable_rank_q_proj": 43.49433135986328, "geo/layer_27/stable_rank_k_proj": 30.663372039794922, "geo/layer_27/stable_rank_o_proj": 109.20384216308594, "geo/layer_27/stable_rank_gate_proj": 73.3972396850586, "geo/layer_27/stable_rank_down_proj": 127.71662139892578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09151649475097656, "geo/layer_27/attn_entropy_mean": 4.319657325744629, "geo/layer_27/attn_entropy_std": 0.6496244072914124, "attnres/final_alpha/block_0": 0.25092583894729614, "attnres/block_norm/0": 1.7780283689498901, "attnres/final_alpha/block_1": 0.004113705828785896, "attnres/block_norm/1": 49782.0078125, "attnres/final_alpha/block_2": 0.008629950694739819, "attnres/block_norm/2": 29602.46875, "attnres/final_alpha/block_3": 0.010448572225868702, "attnres/block_norm/3": 68998.578125, "attnres/final_alpha/block_4": 0.012074709869921207, "attnres/block_norm/4": 16870.279296875, "attnres/final_alpha/block_5": 0.6111214756965637, "attnres/block_norm/5": 6991.6796875, "attnres/final_alpha/block_6": 0.10268572717905045, "attnres/block_norm/6": 45762.453125, "geo/tier1_time_s": 1.3594520092010498, "geo/step": 5175.0, "geo/rankme_slope": 0.0012793386495223089} {"step": 5180, "timestamp": 1778331317.3964367, "train/loss": 2.3825262546539308, "train/z_loss": 0.001347159408032894, "train/perplexity": 10.832233307371348, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698903.4872537907, "perf/iters_per_sec": 0.8101003109234766, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2344150304794312, "data/tokens_consumed": 10865344512, "data/tokens_consumed_B": 10.865344512, "train/loss_slope": -7.307427186276285e-06} {"step": 5190, "timestamp": 1778331327.7731245, "train/loss": 2.3365633487701416, "train/z_loss": 0.0013527189148589969, "train/perplexity": 10.345621107333614, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021919.3002081662, "perf/iters_per_sec": 0.9641262532273132, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372085571289062, "data/tokens_consumed": 10886316032, "data/tokens_consumed_B": 10.886316032, "train/loss_slope": -6.96405351537496e-06} {"step": 5200, "timestamp": 1778331338.1325252, "grad/layer_0/attn": 0.0027617455925792456, "grad/layer_0/mlp": 0.0028355512768030167, "grad/layer_0/attn_mlp_ratio": 0.9739712759826681, "grad/layer_4/attn": 0.001720153377391398, "grad/layer_4/mlp": 0.00253720092587173, "grad/layer_4/attn_mlp_ratio": 0.67797283693766, "grad/layer_8/attn": 0.0038153871428221464, "grad/layer_8/mlp": 0.003559047356247902, "grad/layer_8/attn_mlp_ratio": 1.0720248015025033, "grad/layer_12/attn": 0.008812331594526768, "grad/layer_12/mlp": 0.006958615966141224, "grad/layer_12/attn_mlp_ratio": 1.2663914075393858, "grad/layer_16/attn": 0.0057843346148729324, "grad/layer_16/mlp": 0.004561857786029577, "grad/layer_16/attn_mlp_ratio": 1.2679778194290354, "grad/layer_20/attn": 0.0033132892567664385, "grad/layer_20/mlp": 0.005773502867668867, "grad/layer_20/attn_mlp_ratio": 0.5738785058777282, "grad/layer_24/attn": 0.007023850455880165, "grad/layer_24/mlp": 0.007591187953948975, "grad/layer_24/attn_mlp_ratio": 0.9252636617566499, "grad/layer_27/attn": 0.006115963216871023, "grad/layer_27/mlp": 0.006596064195036888, "grad/layer_27/attn_mlp_ratio": 0.927214008734404} {"step": 5200, "timestamp": 1778331338.1484623, "train/loss": 2.350735974311829, "train/z_loss": 0.0013586252345703543, "train/perplexity": 10.49328967518401, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022262.5444263115, "perf/iters_per_sec": 0.9642899248248632, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370325088500976, "data/tokens_consumed": 10907287552, "data/tokens_consumed_B": 10.907287552, "train/loss_slope": -7.138875150980905e-06} {"step": 5210, "timestamp": 1778331348.5097036, "train/loss": 2.3972166776657104, "train/z_loss": 0.001350088557228446, "train/perplexity": 10.99253798567039, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025824.6666390602, "perf/iters_per_sec": 0.9659884770579625, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352090358734132, "data/tokens_consumed": 10928259072, "data/tokens_consumed_B": 10.928259072, "train/loss_slope": -4.113797371787271e-06} {"step": 5220, "timestamp": 1778331358.858764, "train/loss": 2.328282356262207, "train/z_loss": 0.0013643398531712591, "train/perplexity": 10.260302843960355, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027218.373569021, "perf/iters_per_sec": 0.966653048309813, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034497332572937, "data/tokens_consumed": 10949230592, "data/tokens_consumed_B": 10.949230592, "train/loss_slope": -6.024935035922938e-06} {"step": 5230, "timestamp": 1778331369.2077463, "train/loss": 2.380228042602539, "train/z_loss": 0.0013555224984884262, "train/perplexity": 10.807367123067886, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027352.0976218255, "perf/iters_per_sec": 0.9667168129071357, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034429097175598, "data/tokens_consumed": 10970202112, "data/tokens_consumed_B": 10.970202112, "train/loss_slope": -5.640995856558823e-06} {"step": 5240, "timestamp": 1778331379.5596015, "train/loss": 2.3610101699829102, "train/z_loss": 0.001347801589872688, "train/perplexity": 10.601655519234736, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027193.3314728646, "perf/iters_per_sec": 0.9666411073078464, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034510111808777, "data/tokens_consumed": 10991173632, "data/tokens_consumed_B": 10.991173632, "train/loss_slope": -5.217783607736881e-06} {"step": 5250, "timestamp": 1778331390.2244577, "grad/layer_0/attn": 0.0025931205600500107, "grad/layer_0/mlp": 0.003135493490844965, "grad/layer_0/attn_mlp_ratio": 0.8270214831953161, "grad/layer_4/attn": 0.00171132932882756, "grad/layer_4/mlp": 0.002664608182385564, "grad/layer_4/attn_mlp_ratio": 0.642244242855631, "grad/layer_8/attn": 0.0041123926639556885, "grad/layer_8/mlp": 0.0036538129206746817, "grad/layer_8/attn_mlp_ratio": 1.125507145736847, "grad/layer_12/attn": 0.006675326265394688, "grad/layer_12/mlp": 0.006566740106791258, "grad/layer_12/attn_mlp_ratio": 1.0165357628266047, "grad/layer_16/attn": 0.006122008431702852, "grad/layer_16/mlp": 0.004004987422376871, "grad/layer_16/attn_mlp_ratio": 1.5285961310734812, "grad/layer_20/attn": 0.0040278867818415165, "grad/layer_20/mlp": 0.00549613730981946, "grad/layer_20/attn_mlp_ratio": 0.7328577292564137, "grad/layer_24/attn": 0.005047547165304422, "grad/layer_24/mlp": 0.008021720685064793, "grad/layer_24/attn_mlp_ratio": 0.6292349609951742, "grad/layer_27/attn": 0.004079286474734545, "grad/layer_27/mlp": 0.007085246033966541, "grad/layer_27/attn_mlp_ratio": 0.5757437917616609} {"step": 5250, "timestamp": 1778331390.8270028, "eos/sharpness": 6.963586807250975, "eos/L0_probe": 2.3477230072021484, "eos/L_plus": 2.3870437145233154, "eos/L_minus": 2.378038167953491, "eos/grad_norm": 0.09519200772047043, "eos/embed_grad_frac": 0.2783220708370209, "eos/time_s": 0.5993406772613525} {"step": 5250, "timestamp": 1778331390.8483236, "train/loss": 2.36905722618103, "train/z_loss": 0.0013441997929476202, "train/perplexity": 10.687311815282035, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1858775.0417856495, "perf/iters_per_sec": 0.886333008663964, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.128244113922119, "data/tokens_consumed": 11012145152, "data/tokens_consumed_B": 11.012145152, "train/loss_slope": -2.4728224365480637e-06} {"step": 5250, "timestamp": 1778331392.2135043, "geo/rankme_last": 426.6791076660156, "geo/layer_0/stable_rank_q_proj": 20.712251663208008, "geo/layer_0/stable_rank_k_proj": 17.199129104614258, "geo/layer_0/stable_rank_o_proj": 45.29945755004883, "geo/layer_0/stable_rank_gate_proj": 129.1641845703125, "geo/layer_0/stable_rank_down_proj": 56.51066207885742, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06172889471054077, "geo/layer_0/attn_entropy_mean": 6.249936103820801, "geo/layer_0/attn_entropy_std": 0.4303330183029175, "geo/layer_7/stable_rank_q_proj": 42.4396858215332, "geo/layer_7/stable_rank_k_proj": 39.39598846435547, "geo/layer_7/stable_rank_o_proj": 90.24620056152344, "geo/layer_7/stable_rank_gate_proj": 79.29371643066406, "geo/layer_7/stable_rank_down_proj": 143.6387939453125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3972780704498291, "geo/layer_7/attn_entropy_mean": 4.736961364746094, "geo/layer_7/attn_entropy_std": 0.7680623531341553, "geo/layer_14/stable_rank_q_proj": 51.57667541503906, "geo/layer_14/stable_rank_k_proj": 41.59755325317383, "geo/layer_14/stable_rank_o_proj": 42.599910736083984, "geo/layer_14/stable_rank_gate_proj": 72.08502197265625, "geo/layer_14/stable_rank_down_proj": 127.47586059570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3611370027065277, "geo/layer_14/attn_entropy_mean": 5.514677047729492, "geo/layer_14/attn_entropy_std": 0.4395677149295807, "geo/layer_21/stable_rank_q_proj": 39.370845794677734, "geo/layer_21/stable_rank_k_proj": 29.04990577697754, "geo/layer_21/stable_rank_o_proj": 66.25055694580078, "geo/layer_21/stable_rank_gate_proj": 62.28200912475586, "geo/layer_21/stable_rank_down_proj": 49.70090866088867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14155898988246918, "geo/layer_21/attn_entropy_mean": 5.871044158935547, "geo/layer_21/attn_entropy_std": 0.3224931061267853, "geo/layer_27/stable_rank_q_proj": 43.47734832763672, "geo/layer_27/stable_rank_k_proj": 30.750444412231445, "geo/layer_27/stable_rank_o_proj": 109.18804168701172, "geo/layer_27/stable_rank_gate_proj": 73.2756576538086, "geo/layer_27/stable_rank_down_proj": 127.7380142211914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1024463027715683, "geo/layer_27/attn_entropy_mean": 4.330187797546387, "geo/layer_27/attn_entropy_std": 0.6557467579841614, "attnres/final_alpha/block_0": 0.2512698769569397, "attnres/block_norm/0": 1.7779361009597778, "attnres/final_alpha/block_1": 0.004096281249076128, "attnres/block_norm/1": 49831.9921875, "attnres/final_alpha/block_2": 0.008622356690466404, "attnres/block_norm/2": 29712.3984375, "attnres/final_alpha/block_3": 0.01051512360572815, "attnres/block_norm/3": 68834.1015625, "attnres/final_alpha/block_4": 0.012137049809098244, "attnres/block_norm/4": 16815.845703125, "attnres/final_alpha/block_5": 0.6111389398574829, "attnres/block_norm/5": 6939.4716796875, "attnres/final_alpha/block_6": 0.10222041606903076, "attnres/block_norm/6": 45922.09375, "geo/tier1_time_s": 1.3609941005706787, "geo/step": 5250.0, "geo/rankme_slope": 0.0012214765007565525} {"step": 5260, "timestamp": 1778331402.586781, "train/loss": 2.355365037918091, "train/z_loss": 0.001355446595698595, "train/perplexity": 10.541976380518127, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787170.1238963513, "perf/iters_per_sec": 0.852189123104263, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1734484434127808, "data/tokens_consumed": 11033116672, "data/tokens_consumed_B": 11.033116672, "train/loss_slope": -2.469007868041358e-06} {"step": 5270, "timestamp": 1778331412.9334705, "train/loss": 2.376430606842041, "train/z_loss": 0.0013335389900021254, "train/perplexity": 10.766404666052448, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028272.1972013137, "perf/iters_per_sec": 0.9671555505758828, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339598417282105, "data/tokens_consumed": 11054088192, "data/tokens_consumed_B": 11.054088192, "train/loss_slope": -3.6695547158245876e-06} {"step": 5280, "timestamp": 1778331423.325663, "train/loss": 2.33391432762146, "train/z_loss": 0.0013532967888750136, "train/perplexity": 10.31825160542376, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018977.9677590348, "perf/iters_per_sec": 0.9627237166209387, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038719606399536, "data/tokens_consumed": 11075059712, "data/tokens_consumed_B": 11.075059712, "train/loss_slope": -6.1901424441149556e-06} {"step": 5290, "timestamp": 1778331433.686041, "train/loss": 2.3472882509231567, "train/z_loss": 0.0013548424234613777, "train/perplexity": 10.457174009134993, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025605.4972380577, "perf/iters_per_sec": 0.9658839689436234, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035321044921875, "data/tokens_consumed": 11096031232, "data/tokens_consumed_B": 11.096031232, "train/loss_slope": -5.784106977058125e-06} {"step": 5300, "timestamp": 1778331444.0268505, "grad/layer_0/attn": 0.003236526157706976, "grad/layer_0/mlp": 0.0034351598005741835, "grad/layer_0/attn_mlp_ratio": 0.9421762745792398, "grad/layer_4/attn": 0.0020270701497793198, "grad/layer_4/mlp": 0.0026602488942444324, "grad/layer_4/attn_mlp_ratio": 0.7619851202518925, "grad/layer_8/attn": 0.002729119500145316, "grad/layer_8/mlp": 0.0033095143735408783, "grad/layer_8/attn_mlp_ratio": 0.8246283622459604, "grad/layer_12/attn": 0.0045930310152471066, "grad/layer_12/mlp": 0.006042399909347296, "grad/layer_12/attn_mlp_ratio": 0.7601335575502969, "grad/layer_16/attn": 0.003648576559498906, "grad/layer_16/mlp": 0.0042383442632853985, "grad/layer_16/attn_mlp_ratio": 0.8608494843186062, "grad/layer_20/attn": 0.0028553998563438654, "grad/layer_20/mlp": 0.005820606369525194, "grad/layer_20/attn_mlp_ratio": 0.49056741274192167, "grad/layer_24/attn": 0.010796387679874897, "grad/layer_24/mlp": 0.009887921623885632, "grad/layer_24/attn_mlp_ratio": 1.0918763296633651, "grad/layer_27/attn": 0.005328087136149406, "grad/layer_27/mlp": 0.00997723639011383, "grad/layer_27/attn_mlp_ratio": 0.5340243404502701} {"step": 5300, "timestamp": 1778331444.0424638, "train/loss": 2.3975712776184084, "train/z_loss": 0.0013564569992013276, "train/perplexity": 10.99643663030889, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026525.2723778058, "perf/iters_per_sec": 0.9663225519074468, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348511457443237, "data/tokens_consumed": 11117002752, "data/tokens_consumed_B": 11.117002752, "train/loss_slope": -2.5891436208592207e-06} {"step": 5310, "timestamp": 1778331454.4069111, "train/loss": 2.373623251914978, "train/z_loss": 0.0013432521955110133, "train/perplexity": 10.736221933511022, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024336.572408207, "perf/iters_per_sec": 0.965278898433784, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359700202941895, "data/tokens_consumed": 11137974272, "data/tokens_consumed_B": 11.137974272, "train/loss_slope": -1.6777402389667612e-06} {"step": 5320, "timestamp": 1778331464.7630582, "train/loss": 2.314013671875, "train/z_loss": 0.0013620835379697382, "train/perplexity": 10.114941345867333, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026312.2997095212, "perf/iters_per_sec": 0.966220998625527, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349599123001099, "data/tokens_consumed": 11158945792, "data/tokens_consumed_B": 11.158945792, "train/loss_slope": -3.0447150006366508e-06} {"step": 5325, "timestamp": 1778331470.5175292, "eos/sharpness": 46.65005207061767, "eos/L0_probe": 2.3505630493164062, "eos/L_plus": 2.539240837097168, "eos/L_minus": 2.6283857822418213, "eos/grad_norm": 0.14069660007953644, "eos/embed_grad_frac": 0.1333017498254776, "eos/time_s": 0.5915076732635498} {"step": 5325, "timestamp": 1778331471.8960075, "geo/rankme_last": 427.2911376953125, "geo/layer_0/stable_rank_q_proj": 20.71360206604004, "geo/layer_0/stable_rank_k_proj": 17.227231979370117, "geo/layer_0/stable_rank_o_proj": 45.254093170166016, "geo/layer_0/stable_rank_gate_proj": 129.21131896972656, "geo/layer_0/stable_rank_down_proj": 56.51520538330078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06148804351687431, "geo/layer_0/attn_entropy_mean": 6.2491254806518555, "geo/layer_0/attn_entropy_std": 0.42925894260406494, "geo/layer_7/stable_rank_q_proj": 42.38719940185547, "geo/layer_7/stable_rank_k_proj": 39.36442565917969, "geo/layer_7/stable_rank_o_proj": 90.1519775390625, "geo/layer_7/stable_rank_gate_proj": 79.30292510986328, "geo/layer_7/stable_rank_down_proj": 143.6822509765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4079323410987854, "geo/layer_7/attn_entropy_mean": 4.734854221343994, "geo/layer_7/attn_entropy_std": 0.7726791501045227, "geo/layer_14/stable_rank_q_proj": 51.51304626464844, "geo/layer_14/stable_rank_k_proj": 41.712547302246094, "geo/layer_14/stable_rank_o_proj": 42.57429885864258, "geo/layer_14/stable_rank_gate_proj": 72.03814697265625, "geo/layer_14/stable_rank_down_proj": 127.31718444824219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3631816804409027, "geo/layer_14/attn_entropy_mean": 5.515841960906982, "geo/layer_14/attn_entropy_std": 0.44454294443130493, "geo/layer_21/stable_rank_q_proj": 39.447723388671875, "geo/layer_21/stable_rank_k_proj": 29.06707191467285, "geo/layer_21/stable_rank_o_proj": 66.19482421875, "geo/layer_21/stable_rank_gate_proj": 62.26462173461914, "geo/layer_21/stable_rank_down_proj": 49.710384368896484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13221359252929688, "geo/layer_21/attn_entropy_mean": 5.874409198760986, "geo/layer_21/attn_entropy_std": 0.31853246688842773, "geo/layer_27/stable_rank_q_proj": 43.441925048828125, "geo/layer_27/stable_rank_k_proj": 30.740436553955078, "geo/layer_27/stable_rank_o_proj": 109.01492309570312, "geo/layer_27/stable_rank_gate_proj": 73.1791763305664, "geo/layer_27/stable_rank_down_proj": 127.85263061523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10082338750362396, "geo/layer_27/attn_entropy_mean": 4.325949192047119, "geo/layer_27/attn_entropy_std": 0.6294901967048645, "attnres/final_alpha/block_0": 0.2509309649467468, "attnres/block_norm/0": 1.778025507926941, "attnres/final_alpha/block_1": 0.004055236931890249, "attnres/block_norm/1": 49909.046875, "attnres/final_alpha/block_2": 0.008642427623271942, "attnres/block_norm/2": 29693.49609375, "attnres/final_alpha/block_3": 0.010626137256622314, "attnres/block_norm/3": 69357.25, "attnres/final_alpha/block_4": 0.012000352144241333, "attnres/block_norm/4": 16824.234375, "attnres/final_alpha/block_5": 0.6126952171325684, "attnres/block_norm/5": 6952.8671875, "attnres/final_alpha/block_6": 0.10104967653751373, "attnres/block_norm/6": 45899.2734375, "geo/tier1_time_s": 1.3596546649932861, "geo/step": 5325.0, "geo/rankme_slope": 0.0011794469350240096} {"step": 5330, "timestamp": 1778331477.0713177, "train/loss": 2.3655371189117433, "train/z_loss": 0.001360377948731184, "train/perplexity": 10.649757467716327, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704800.0899231837, "perf/iters_per_sec": 0.8129120301834029, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2301454067230224, "data/tokens_consumed": 11179917312, "data/tokens_consumed_B": 11.179917312, "train/loss_slope": -5.1153728062777345e-06} {"step": 5340, "timestamp": 1778331487.421278, "train/loss": 2.3879202604293823, "train/z_loss": 0.00134267135290429, "train/perplexity": 10.890820303640025, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027140.1657900116, "perf/iters_per_sec": 0.9666157559347208, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345372438430787, "data/tokens_consumed": 11200888832, "data/tokens_consumed_B": 11.200888832, "train/loss_slope": -4.486955067004737e-06} {"step": 5350, "timestamp": 1778331497.7594988, "grad/layer_0/attn": 0.0031131471041589975, "grad/layer_0/mlp": 0.0034273138735443354, "grad/layer_0/attn_mlp_ratio": 0.9083343773548578, "grad/layer_4/attn": 0.0021735457703471184, "grad/layer_4/mlp": 0.0026758138556033373, "grad/layer_4/attn_mlp_ratio": 0.8122933082830983, "grad/layer_8/attn": 0.004130477551370859, "grad/layer_8/mlp": 0.003402096452191472, "grad/layer_8/attn_mlp_ratio": 1.214097685943157, "grad/layer_12/attn": 0.005173726938664913, "grad/layer_12/mlp": 0.0072294920682907104, "grad/layer_12/attn_mlp_ratio": 0.7156418207847859, "grad/layer_16/attn": 0.004220263566821814, "grad/layer_16/mlp": 0.004719255957752466, "grad/layer_16/attn_mlp_ratio": 0.8942645864466409, "grad/layer_20/attn": 0.003292869543656707, "grad/layer_20/mlp": 0.006099746096879244, "grad/layer_20/attn_mlp_ratio": 0.5398371403291184, "grad/layer_24/attn": 0.006226108409464359, "grad/layer_24/mlp": 0.009419220499694347, "grad/layer_24/attn_mlp_ratio": 0.6610003814611154, "grad/layer_27/attn": 0.00803970918059349, "grad/layer_27/mlp": 0.007356771733611822, "grad/layer_27/attn_mlp_ratio": 1.092831117020844} {"step": 5350, "timestamp": 1778331497.7753131, "train/loss": 2.3492135047912597, "train/z_loss": 0.001353369140997529, "train/perplexity": 10.477326116582258, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026671.7929388194, "perf/iters_per_sec": 0.9663924183553788, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347763299942017, "data/tokens_consumed": 11221860352, "data/tokens_consumed_B": 11.221860352, "train/loss_slope": -5.770518305492604e-06} {"step": 5360, "timestamp": 1778331508.1208084, "train/loss": 2.367488241195679, "train/z_loss": 0.001341875665821135, "train/perplexity": 10.670556731184963, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028429.02743668, "perf/iters_per_sec": 0.9672303330596351, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338798999786376, "data/tokens_consumed": 11242831872, "data/tokens_consumed_B": 11.242831872, "train/loss_slope": -5.0420909896470135e-06} {"step": 5370, "timestamp": 1778331518.476378, "train/loss": 2.3954071760177613, "train/z_loss": 0.0013466927339322865, "train/perplexity": 10.972664955632993, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026081.4976002153, "perf/iters_per_sec": 0.9661109436036183, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350778102874756, "data/tokens_consumed": 11263803392, "data/tokens_consumed_B": 11.263803392, "train/loss_slope": -2.506533922320704e-06} {"step": 5380, "timestamp": 1778331528.822217, "train/loss": 2.3646825790405273, "train/z_loss": 0.0013441350194625557, "train/perplexity": 10.640660712664895, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028330.7542970076, "perf/iters_per_sec": 0.9671834727749861, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033929991722107, "data/tokens_consumed": 11284774912, "data/tokens_consumed_B": 11.284774912, "train/loss_slope": 1.0767921684670777e-06} {"step": 5390, "timestamp": 1778331539.6704423, "train/loss": 2.379384994506836, "train/z_loss": 0.0013407746562734246, "train/perplexity": 10.7982598322767, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934371.791960558, "perf/iters_per_sec": 0.922380348186759, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0841514587402343, "data/tokens_consumed": 11305746432, "data/tokens_consumed_B": 11.305746432, "train/loss_slope": 3.794059501623168e-07} {"step": 5400, "timestamp": 1778331550.0095606, "grad/layer_0/attn": 0.003562007565051317, "grad/layer_0/mlp": 0.003706754418089986, "grad/layer_0/attn_mlp_ratio": 0.9609504885386222, "grad/layer_4/attn": 0.00253005581907928, "grad/layer_4/mlp": 0.002731715328991413, "grad/layer_4/attn_mlp_ratio": 0.9261783977306198, "grad/layer_8/attn": 0.003356399480253458, "grad/layer_8/mlp": 0.0033818332012742758, "grad/layer_8/attn_mlp_ratio": 0.9924792800960254, "grad/layer_12/attn": 0.00782125722616911, "grad/layer_12/mlp": 0.007359163835644722, "grad/layer_12/attn_mlp_ratio": 1.0627915473232226, "grad/layer_16/attn": 0.004913009703159332, "grad/layer_16/mlp": 0.005148388911038637, "grad/layer_16/attn_mlp_ratio": 0.9542809785013081, "grad/layer_20/attn": 0.005737596191465855, "grad/layer_20/mlp": 0.007389660459011793, "grad/layer_20/attn_mlp_ratio": 0.7764356895214595, "grad/layer_24/attn": 0.014613581821322441, "grad/layer_24/mlp": 0.014749799855053425, "grad/layer_24/attn_mlp_ratio": 0.9907647470375139, "grad/layer_27/attn": 0.009115047752857208, "grad/layer_27/mlp": 0.014793417416512966, "grad/layer_27/attn_mlp_ratio": 0.6161556477860948} {"step": 5400, "timestamp": 1778331550.6101768, "eos/sharpness": 32.11464881896972, "eos/L0_probe": 2.350080966949463, "eos/L_plus": 2.532907485961914, "eos/L_minus": 2.488400936126709, "eos/grad_norm": 0.1929747611284256, "eos/embed_grad_frac": 0.08174685388803482, "eos/time_s": 0.5976474285125732} {"step": 5400, "timestamp": 1778331550.630959, "train/loss": 2.332233357429504, "train/z_loss": 0.001354022731538862, "train/perplexity": 10.300921501816488, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914384.8990772245, "perf/iters_per_sec": 0.91284985498296, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.095470404624939, "data/tokens_consumed": 11326717952, "data/tokens_consumed_B": 11.326717952, "train/loss_slope": -2.2796802824050387e-06} {"step": 5400, "timestamp": 1778331551.9949362, "geo/rankme_last": 427.4221496582031, "geo/layer_0/stable_rank_q_proj": 20.722549438476562, "geo/layer_0/stable_rank_k_proj": 17.228755950927734, "geo/layer_0/stable_rank_o_proj": 45.27238464355469, "geo/layer_0/stable_rank_gate_proj": 128.98731994628906, "geo/layer_0/stable_rank_down_proj": 56.46189880371094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06583674997091293, "geo/layer_0/attn_entropy_mean": 6.252511501312256, "geo/layer_0/attn_entropy_std": 0.433977872133255, "geo/layer_7/stable_rank_q_proj": 42.37260055541992, "geo/layer_7/stable_rank_k_proj": 39.270870208740234, "geo/layer_7/stable_rank_o_proj": 90.17203521728516, "geo/layer_7/stable_rank_gate_proj": 79.3245620727539, "geo/layer_7/stable_rank_down_proj": 143.41168212890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4142763018608093, "geo/layer_7/attn_entropy_mean": 4.759885787963867, "geo/layer_7/attn_entropy_std": 0.7688595056533813, "geo/layer_14/stable_rank_q_proj": 51.53688430786133, "geo/layer_14/stable_rank_k_proj": 41.731483459472656, "geo/layer_14/stable_rank_o_proj": 42.59077453613281, "geo/layer_14/stable_rank_gate_proj": 72.09114837646484, "geo/layer_14/stable_rank_down_proj": 127.43441009521484, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3676827847957611, "geo/layer_14/attn_entropy_mean": 5.507773399353027, "geo/layer_14/attn_entropy_std": 0.4396558105945587, "geo/layer_21/stable_rank_q_proj": 39.37080383300781, "geo/layer_21/stable_rank_k_proj": 29.09560775756836, "geo/layer_21/stable_rank_o_proj": 66.09756469726562, "geo/layer_21/stable_rank_gate_proj": 62.11482620239258, "geo/layer_21/stable_rank_down_proj": 49.72697830200195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14402464032173157, "geo/layer_21/attn_entropy_mean": 5.873207092285156, "geo/layer_21/attn_entropy_std": 0.31618568301200867, "geo/layer_27/stable_rank_q_proj": 43.48658752441406, "geo/layer_27/stable_rank_k_proj": 30.682376861572266, "geo/layer_27/stable_rank_o_proj": 108.92970275878906, "geo/layer_27/stable_rank_gate_proj": 73.1084976196289, "geo/layer_27/stable_rank_down_proj": 128.22564697265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09187193214893341, "geo/layer_27/attn_entropy_mean": 4.343557357788086, "geo/layer_27/attn_entropy_std": 0.6550231575965881, "attnres/final_alpha/block_0": 0.2513119578361511, "attnres/block_norm/0": 1.7782158851623535, "attnres/final_alpha/block_1": 0.00408712774515152, "attnres/block_norm/1": 49806.265625, "attnres/final_alpha/block_2": 0.008689600974321365, "attnres/block_norm/2": 29597.357421875, "attnres/final_alpha/block_3": 0.010462409816682339, "attnres/block_norm/3": 68918.5, "attnres/final_alpha/block_4": 0.011888628825545311, "attnres/block_norm/4": 16884.681640625, "attnres/final_alpha/block_5": 0.610132098197937, "attnres/block_norm/5": 7104.12548828125, "attnres/final_alpha/block_6": 0.1034281775355339, "attnres/block_norm/6": 46154.703125, "geo/tier1_time_s": 1.3604440689086914, "geo/step": 5400.0, "geo/rankme_slope": 0.001164167893719988} {"step": 5410, "timestamp": 1778331562.358807, "train/loss": 2.38046658039093, "train/z_loss": 0.0013511414406821131, "train/perplexity": 10.80994539601529, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788697.7383539083, "perf/iters_per_sec": 0.8529175464410345, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1724462747573852, "data/tokens_consumed": 11347689472, "data/tokens_consumed_B": 11.347689472, "train/loss_slope": -3.53766205859474e-06} {"step": 5420, "timestamp": 1778331572.7106955, "train/loss": 2.4116191625595094, "train/z_loss": 0.0013434895197860896, "train/perplexity": 11.15200344085357, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027103.9605283984, "perf/iters_per_sec": 0.9665984919206612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034555721282959, "data/tokens_consumed": 11368660992, "data/tokens_consumed_B": 11.368660992, "train/loss_slope": 5.124416431435506e-07} {"step": 5430, "timestamp": 1778331583.057536, "train/loss": 2.339110803604126, "train/z_loss": 0.0013491071993485093, "train/perplexity": 10.372009707446733, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027799.187346169, "perf/iters_per_sec": 0.9669300019007535, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342010259628296, "data/tokens_consumed": 11389632512, "data/tokens_consumed_B": 11.389632512, "train/loss_slope": -2.5603763627247156e-06} {"step": 5440, "timestamp": 1778331593.411533, "train/loss": 2.388593554496765, "train/z_loss": 0.0013394374283961952, "train/perplexity": 10.898155497433528, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026216.7519932317, "perf/iters_per_sec": 0.9661754379240187, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035008716583252, "data/tokens_consumed": 11410604032, "data/tokens_consumed_B": 11.410604032, "train/loss_slope": -3.3320234374816257e-06} {"step": 5450, "timestamp": 1778331603.746233, "grad/layer_0/attn": 0.002910887822508812, "grad/layer_0/mlp": 0.003144550370052457, "grad/layer_0/attn_mlp_ratio": 0.9256928296209705, "grad/layer_4/attn": 0.0023689521476626396, "grad/layer_4/mlp": 0.0027038033585995436, "grad/layer_4/attn_mlp_ratio": 0.8761554543205063, "grad/layer_8/attn": 0.004428850021213293, "grad/layer_8/mlp": 0.003369695506989956, "grad/layer_8/attn_mlp_ratio": 1.3143175342088085, "grad/layer_12/attn": 0.005423394963145256, "grad/layer_12/mlp": 0.006783775519579649, "grad/layer_12/attn_mlp_ratio": 0.7994655583082674, "grad/layer_16/attn": 0.0035960644017904997, "grad/layer_16/mlp": 0.004736003000289202, "grad/layer_16/attn_mlp_ratio": 0.7593036418348011, "grad/layer_20/attn": 0.0032646090257912874, "grad/layer_20/mlp": 0.00597954960539937, "grad/layer_20/attn_mlp_ratio": 0.5459623527911198, "grad/layer_24/attn": 0.0052194735035300255, "grad/layer_24/mlp": 0.007989926263689995, "grad/layer_24/attn_mlp_ratio": 0.6532567718333153, "grad/layer_27/attn": 0.003898301161825657, "grad/layer_27/mlp": 0.007472863886505365, "grad/layer_27/attn_mlp_ratio": 0.5216609279742386} {"step": 5450, "timestamp": 1778331603.7617989, "train/loss": 2.3810928106307983, "train/z_loss": 0.0013560958905145526, "train/perplexity": 10.816717030793535, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027144.6039433214, "perf/iters_per_sec": 0.9666178722111327, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345349788665772, "data/tokens_consumed": 11431575552, "data/tokens_consumed_B": 11.431575552, "train/loss_slope": -1.7891641068021775e-06} {"step": 5460, "timestamp": 1778331614.6540127, "train/loss": 2.3814218521118162, "train/z_loss": 0.001355381216853857, "train/perplexity": 10.82027676500309, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926667.7661307654, "perf/iters_per_sec": 0.9187067824033572, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0884865760803222, "data/tokens_consumed": 11452547072, "data/tokens_consumed_B": 11.452547072, "train/loss_slope": -1.7550407785071525e-06} {"step": 5470, "timestamp": 1778331625.0108025, "train/loss": 2.3480871438980104, "train/z_loss": 0.0013555590179748834, "train/perplexity": 10.465531509917545, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026047.7101568088, "perf/iters_per_sec": 0.9660948324951214, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350950717926026, "data/tokens_consumed": 11473518592, "data/tokens_consumed_B": 11.473518592, "train/loss_slope": -3.4827310331511885e-06} {"step": 5475, "timestamp": 1778331630.7694125, "eos/sharpness": 43.3938980102539, "eos/L0_probe": 2.3492307662963867, "eos/L_plus": 2.553368091583252, "eos/L_minus": 2.5790324211120605, "eos/grad_norm": 0.13237786293029785, "eos/embed_grad_frac": 0.14424705505371094, "eos/time_s": 0.5955455303192139} {"step": 5475, "timestamp": 1778331632.1458488, "geo/rankme_last": 427.61865234375, "geo/layer_0/stable_rank_q_proj": 20.721399307250977, "geo/layer_0/stable_rank_k_proj": 17.237558364868164, "geo/layer_0/stable_rank_o_proj": 45.20381546020508, "geo/layer_0/stable_rank_gate_proj": 128.8455047607422, "geo/layer_0/stable_rank_down_proj": 56.44035339355469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06686422973871231, "geo/layer_0/attn_entropy_mean": 6.256044864654541, "geo/layer_0/attn_entropy_std": 0.43304309248924255, "geo/layer_7/stable_rank_q_proj": 42.39788818359375, "geo/layer_7/stable_rank_k_proj": 39.362850189208984, "geo/layer_7/stable_rank_o_proj": 90.11115264892578, "geo/layer_7/stable_rank_gate_proj": 79.4251937866211, "geo/layer_7/stable_rank_down_proj": 143.5552978515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4114783704280853, "geo/layer_7/attn_entropy_mean": 4.733415603637695, "geo/layer_7/attn_entropy_std": 0.7933922410011292, "geo/layer_14/stable_rank_q_proj": 51.51396179199219, "geo/layer_14/stable_rank_k_proj": 41.750083923339844, "geo/layer_14/stable_rank_o_proj": 42.585025787353516, "geo/layer_14/stable_rank_gate_proj": 72.08378601074219, "geo/layer_14/stable_rank_down_proj": 127.52117919921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3764933943748474, "geo/layer_14/attn_entropy_mean": 5.535767555236816, "geo/layer_14/attn_entropy_std": 0.42620986700057983, "geo/layer_21/stable_rank_q_proj": 39.33034133911133, "geo/layer_21/stable_rank_k_proj": 29.117218017578125, "geo/layer_21/stable_rank_o_proj": 66.08432006835938, "geo/layer_21/stable_rank_gate_proj": 62.08584976196289, "geo/layer_21/stable_rank_down_proj": 49.66018295288086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1393103003501892, "geo/layer_21/attn_entropy_mean": 5.853025436401367, "geo/layer_21/attn_entropy_std": 0.3125304877758026, "geo/layer_27/stable_rank_q_proj": 43.444400787353516, "geo/layer_27/stable_rank_k_proj": 30.617008209228516, "geo/layer_27/stable_rank_o_proj": 108.80087280273438, "geo/layer_27/stable_rank_gate_proj": 73.10772705078125, "geo/layer_27/stable_rank_down_proj": 128.03759765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09937117248773575, "geo/layer_27/attn_entropy_mean": 4.326231956481934, "geo/layer_27/attn_entropy_std": 0.6725144982337952, "attnres/final_alpha/block_0": 0.2530350387096405, "attnres/block_norm/0": 1.7781345844268799, "attnres/final_alpha/block_1": 0.004087450448423624, "attnres/block_norm/1": 49790.70703125, "attnres/final_alpha/block_2": 0.008926749229431152, "attnres/block_norm/2": 29718.662109375, "attnres/final_alpha/block_3": 0.01071283221244812, "attnres/block_norm/3": 69283.234375, "attnres/final_alpha/block_4": 0.012152114883065224, "attnres/block_norm/4": 16833.455078125, "attnres/final_alpha/block_5": 0.6084325313568115, "attnres/block_norm/5": 7020.8828125, "attnres/final_alpha/block_6": 0.10265327990055084, "attnres/block_norm/6": 46027.5546875, "geo/tier1_time_s": 1.3569118976593018, "geo/step": 5475.0, "geo/rankme_slope": 0.001149251419317727} {"step": 5480, "timestamp": 1778331637.3218946, "train/loss": 2.3763718366622926, "train/z_loss": 0.001353485754225403, "train/perplexity": 10.765771941107841, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704108.0896441268, "perf/iters_per_sec": 0.8125820587368616, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.230644941329956, "data/tokens_consumed": 11494490112, "data/tokens_consumed_B": 11.494490112, "train/loss_slope": -4.283963485841351e-06} {"step": 5490, "timestamp": 1778331647.6694534, "train/loss": 2.3852879762649537, "train/z_loss": 0.0013488870463334024, "train/perplexity": 10.862190267543705, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028018.8317420878, "perf/iters_per_sec": 0.9670347365103187, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340890169143677, "data/tokens_consumed": 11515461632, "data/tokens_consumed_B": 11.515461632, "train/loss_slope": -9.111232883465565e-07} {"step": 5500, "timestamp": 1778331658.0123384, "grad/layer_0/attn": 0.0030596754513680935, "grad/layer_0/mlp": 0.003441000822931528, "grad/layer_0/attn_mlp_ratio": 0.889181816540006, "grad/layer_4/attn": 0.0023107000160962343, "grad/layer_4/mlp": 0.0028826117049902678, "grad/layer_4/attn_mlp_ratio": 0.8015994425943994, "grad/layer_8/attn": 0.004818149376660585, "grad/layer_8/mlp": 0.004007116891443729, "grad/layer_8/attn_mlp_ratio": 1.202397980131009, "grad/layer_12/attn": 0.011524646542966366, "grad/layer_12/mlp": 0.007700195536017418, "grad/layer_12/attn_mlp_ratio": 1.4966693169534813, "grad/layer_16/attn": 0.0061401138082146645, "grad/layer_16/mlp": 0.004696182440966368, "grad/layer_16/attn_mlp_ratio": 1.3074691527964262, "grad/layer_20/attn": 0.003141386667266488, "grad/layer_20/mlp": 0.006443988997489214, "grad/layer_20/attn_mlp_ratio": 0.4874909966080597, "grad/layer_24/attn": 0.006593706551939249, "grad/layer_24/mlp": 0.008249955251812935, "grad/layer_24/attn_mlp_ratio": 0.79924148322697, "grad/layer_27/attn": 0.01071486808359623, "grad/layer_27/mlp": 0.008199524134397507, "grad/layer_27/attn_mlp_ratio": 1.3067670485863925} {"step": 5500, "timestamp": 1778331658.0279193, "train/loss": 2.361929988861084, "train/z_loss": 0.0013483659829944372, "train/perplexity": 10.611411608350839, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025858.026719532, "perf/iters_per_sec": 0.9660043843839321, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351919889450074, "data/tokens_consumed": 11536433152, "data/tokens_consumed_B": 11.536433152, "train/loss_slope": -3.7210444591440473e-06} {"step": 5500, "timestamp": 1778331665.0493262, "geo/ww_alpha_mean": 7.54655053703431, "geo/ww_alpha_std": 4.481170967923072, "geo/ww_alpha_min": 1.3450776341350195, "geo/ww_alpha_max": 33.29827870225375, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.897550390186866, "geo/ww_alpha_by_type/k_proj": 4.381002660099037, "geo/ww_alpha_by_type/v_proj": 9.135323649788248, "geo/ww_alpha_by_type/o_proj": 7.7533588185756415, "geo/ww_alpha_by_type/gate_proj": 7.864094711414984, "geo/ww_alpha_by_type/up_proj": 11.749885581199536, "geo/ww_alpha_by_type/down_proj": 8.136498780337067, "geo/twonn_id/layer_0": 0.7280786037445068, "geo/twonn_id/layer_7": 3.5753796100616455, "geo/twonn_id/layer_14": 5.260851860046387, "geo/twonn_id/layer_21": 7.582973957061768, "geo/twonn_id/layer_27": 6.896149635314941, "geo/tier2_time_s": 7.016064405441284} {"step": 5500, "timestamp": 1778331665.8033977, "eoc/jacobian_sigma/layer_0/attn": 1520.62939453125, "eoc/jacobian_sigma/layer_0/mlp": 10944.63671875, "eoc/jacobian_sigma/layer_0": 10944.63671875, "eoc/jacobian_sigma/layer_7/attn": 1.1382927894592285, "eoc/jacobian_sigma/layer_7/mlp": 1.7575634717941284, "eoc/jacobian_sigma/layer_7": 1.7575634717941284, "eoc/jacobian_sigma/layer_14/attn": 1.9275808334350586, "eoc/jacobian_sigma/layer_14/mlp": 11.73361873626709, "eoc/jacobian_sigma/layer_14": 11.73361873626709, "eoc/jacobian_sigma/layer_21/attn": 1.0877751111984253, "eoc/jacobian_sigma/layer_21/mlp": 5.092782974243164, "eoc/jacobian_sigma/layer_21": 5.092782974243164, "eoc/jacobian_sigma/layer_27/attn": 3.59883975982666, "eoc/jacobian_sigma/layer_27/mlp": 27.01008415222168, "eoc/jacobian_sigma/layer_27": 27.01008415222168, "eoc/layer0_sigma": 10944.63671875, "eoc/sigma_max": 27.01008415222168, "eoc/sigma_min": 1.7575634717941284, "eoc/sigma_mean": 11.398512333631516, "eoc/time_s": 0.7456545829772949} {"step": 5510, "timestamp": 1778331676.1736245, "train/loss": 2.408664011955261, "train/z_loss": 0.0013464054558426143, "train/perplexity": 11.119096237964984, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1155996.2184509581, "perf/iters_per_sec": 0.5512219516997138, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8141512632369996, "data/tokens_consumed": 11557404672, "data/tokens_consumed_B": 11.557404672, "train/loss_slope": -3.149010017044766e-06} {"step": 5520, "timestamp": 1778331686.530664, "train/loss": 2.382335138320923, "train/z_loss": 0.0013504587928764521, "train/perplexity": 10.83016328847687, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026264.3613556866, "perf/iters_per_sec": 0.9661981398371156, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349843978881836, "data/tokens_consumed": 11578376192, "data/tokens_consumed_B": 11.578376192, "train/loss_slope": -2.1093305867604315e-06} {"step": 5530, "timestamp": 1778331696.8825047, "train/loss": 2.3865516901016237, "train/z_loss": 0.001360359787940979, "train/perplexity": 10.875925644646472, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026815.8594608395, "perf/iters_per_sec": 0.9664611146263311, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347027778625488, "data/tokens_consumed": 11599347712, "data/tokens_consumed_B": 11.599347712, "train/loss_slope": 7.241536073773674e-07} {"step": 5540, "timestamp": 1778331707.2263653, "train/loss": 2.3157108783721925, "train/z_loss": 0.0013666434912011026, "train/perplexity": 10.132123066377757, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028598.6540284185, "perf/iters_per_sec": 0.9673112173215954, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337934494018555, "data/tokens_consumed": 11620319232, "data/tokens_consumed_B": 11.620319232, "train/loss_slope": -2.703364074963374e-06} {"step": 5550, "timestamp": 1778331717.5680344, "grad/layer_0/attn": 0.003014942863956094, "grad/layer_0/mlp": 0.00321228988468647, "grad/layer_0/attn_mlp_ratio": 0.9385649733768858, "grad/layer_4/attn": 0.0039536538533866405, "grad/layer_4/mlp": 0.002505327807739377, "grad/layer_4/attn_mlp_ratio": 1.5780983563760824, "grad/layer_8/attn": 0.005759314633905888, "grad/layer_8/mlp": 0.003338468261063099, "grad/layer_8/attn_mlp_ratio": 1.725136802576106, "grad/layer_12/attn": 0.0048568700440227985, "grad/layer_12/mlp": 0.006646302994340658, "grad/layer_12/attn_mlp_ratio": 0.730762647306656, "grad/layer_16/attn": 0.006987195927649736, "grad/layer_16/mlp": 0.005042173434048891, "grad/layer_16/attn_mlp_ratio": 1.3857507839558594, "grad/layer_20/attn": 0.003243298502638936, "grad/layer_20/mlp": 0.0069711958058178425, "grad/layer_20/attn_mlp_ratio": 0.4652427713202303, "grad/layer_24/attn": 0.018949590623378754, "grad/layer_24/mlp": 0.015174628235399723, "grad/layer_24/attn_mlp_ratio": 1.248768022816922, "grad/layer_27/attn": 0.006982157938182354, "grad/layer_27/mlp": 0.016971712931990623, "grad/layer_27/attn_mlp_ratio": 0.41139971698917016} {"step": 5550, "timestamp": 1778331718.1874883, "eos/sharpness": 58.81526470184325, "eos/L0_probe": 2.3493809700012207, "eos/L_plus": 2.677123546600342, "eos/L_minus": 2.6097910404205322, "eos/grad_norm": 0.254231333732605, "eos/embed_grad_frac": 0.04550860822200775, "eos/time_s": 0.6165246963500977} {"step": 5550, "timestamp": 1778331718.2070718, "train/loss": 2.384216570854187, "train/z_loss": 0.00135418203426525, "train/perplexity": 10.850558690298344, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911363.647757658, "perf/iters_per_sec": 0.9114092100895205, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0972019910812378, "data/tokens_consumed": 11641290752, "data/tokens_consumed_B": 11.641290752, "train/loss_slope": -5.073581842055393e-07} {"step": 5550, "timestamp": 1778331719.5732768, "geo/rankme_last": 427.4388427734375, "geo/layer_0/stable_rank_q_proj": 20.7076416015625, "geo/layer_0/stable_rank_k_proj": 17.274986267089844, "geo/layer_0/stable_rank_o_proj": 45.192718505859375, "geo/layer_0/stable_rank_gate_proj": 128.83740234375, "geo/layer_0/stable_rank_down_proj": 56.475364685058594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06564192473888397, "geo/layer_0/attn_entropy_mean": 6.25253963470459, "geo/layer_0/attn_entropy_std": 0.43161386251449585, "geo/layer_7/stable_rank_q_proj": 42.433231353759766, "geo/layer_7/stable_rank_k_proj": 39.28706741333008, "geo/layer_7/stable_rank_o_proj": 90.07951354980469, "geo/layer_7/stable_rank_gate_proj": 79.39456939697266, "geo/layer_7/stable_rank_down_proj": 143.71714782714844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4146754741668701, "geo/layer_7/attn_entropy_mean": 4.714084625244141, "geo/layer_7/attn_entropy_std": 0.7574917674064636, "geo/layer_14/stable_rank_q_proj": 51.53394317626953, "geo/layer_14/stable_rank_k_proj": 41.788021087646484, "geo/layer_14/stable_rank_o_proj": 42.640342712402344, "geo/layer_14/stable_rank_gate_proj": 72.06793212890625, "geo/layer_14/stable_rank_down_proj": 127.37555694580078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37808915972709656, "geo/layer_14/attn_entropy_mean": 5.522335052490234, "geo/layer_14/attn_entropy_std": 0.47575443983078003, "geo/layer_21/stable_rank_q_proj": 39.37286376953125, "geo/layer_21/stable_rank_k_proj": 29.093015670776367, "geo/layer_21/stable_rank_o_proj": 66.09585571289062, "geo/layer_21/stable_rank_gate_proj": 62.141788482666016, "geo/layer_21/stable_rank_down_proj": 49.652000427246094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13883166015148163, "geo/layer_21/attn_entropy_mean": 5.880723476409912, "geo/layer_21/attn_entropy_std": 0.3125351071357727, "geo/layer_27/stable_rank_q_proj": 43.39640426635742, "geo/layer_27/stable_rank_k_proj": 30.597333908081055, "geo/layer_27/stable_rank_o_proj": 108.94635772705078, "geo/layer_27/stable_rank_gate_proj": 73.03121948242188, "geo/layer_27/stable_rank_down_proj": 128.1122589111328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09858305007219315, "geo/layer_27/attn_entropy_mean": 4.323884010314941, "geo/layer_27/attn_entropy_std": 0.6581077575683594, "attnres/final_alpha/block_0": 0.2502261996269226, "attnres/block_norm/0": 1.7778372764587402, "attnres/final_alpha/block_1": 0.004012412391602993, "attnres/block_norm/1": 49804.3828125, "attnres/final_alpha/block_2": 0.008525745943188667, "attnres/block_norm/2": 29716.80859375, "attnres/final_alpha/block_3": 0.010628441348671913, "attnres/block_norm/3": 69243.859375, "attnres/final_alpha/block_4": 0.01180748175829649, "attnres/block_norm/4": 16854.078125, "attnres/final_alpha/block_5": 0.6148678064346313, "attnres/block_norm/5": 7011.029296875, "attnres/final_alpha/block_6": 0.09993190318346024, "attnres/block_norm/6": 46477.34375, "geo/tier1_time_s": 1.3623466491699219, "geo/step": 5550.0, "geo/rankme_slope": 0.0011427495216836735} {"step": 5560, "timestamp": 1778331729.9196553, "train/loss": 2.3648608207702635, "train/z_loss": 0.0013521032640710473, "train/perplexity": 10.642557491473402, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791088.0429599204, "perf/iters_per_sec": 0.8540573324966051, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1708815813064575, "data/tokens_consumed": 11662262272, "data/tokens_consumed_B": 11.662262272, "train/loss_slope": -4.260389303395668e-06} {"step": 5570, "timestamp": 1778331740.2817044, "train/loss": 2.3740387916564942, "train/z_loss": 0.0013434894965030253, "train/perplexity": 10.74068418745585, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025446.3049312164, "perf/iters_per_sec": 0.9658080601364214, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354024171829224, "data/tokens_consumed": 11683233792, "data/tokens_consumed_B": 11.683233792, "train/loss_slope": -1.3880132520087836e-06} {"step": 5580, "timestamp": 1778331750.6486938, "train/loss": 2.3642292499542235, "train/z_loss": 0.001344918494578451, "train/perplexity": 10.63583808486767, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024568.7012740884, "perf/iters_per_sec": 0.9653895861025278, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035851240158081, "data/tokens_consumed": 11704205312, "data/tokens_consumed_B": 11.704205312, "train/loss_slope": 7.195969154409205e-08} {"step": 5590, "timestamp": 1778331761.010187, "train/loss": 2.374973917007446, "train/z_loss": 0.0013440293725579977, "train/perplexity": 10.750732771136523, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025440.3351234742, "perf/iters_per_sec": 0.9658052135102626, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354054689407348, "data/tokens_consumed": 11725176832, "data/tokens_consumed_B": 11.725176832, "train/loss_slope": 2.443427478733977e-07} {"step": 5600, "timestamp": 1778331771.3532853, "grad/layer_0/attn": 0.00385400652885437, "grad/layer_0/mlp": 0.00355478934943676, "grad/layer_0/attn_mlp_ratio": 1.084172940106205, "grad/layer_4/attn": 0.0033295825123786926, "grad/layer_4/mlp": 0.002655952237546444, "grad/layer_4/attn_mlp_ratio": 1.2536303702853888, "grad/layer_8/attn": 0.004532069433480501, "grad/layer_8/mlp": 0.00340220401994884, "grad/layer_8/attn_mlp_ratio": 1.3320980381237828, "grad/layer_12/attn": 0.006204521749168634, "grad/layer_12/mlp": 0.007938089780509472, "grad/layer_12/attn_mlp_ratio": 0.7816139452392323, "grad/layer_16/attn": 0.0049463436007499695, "grad/layer_16/mlp": 0.005046583712100983, "grad/layer_16/attn_mlp_ratio": 0.9801370164286877, "grad/layer_20/attn": 0.004059416241943836, "grad/layer_20/mlp": 0.0058450596407055855, "grad/layer_20/attn_mlp_ratio": 0.694503806979705, "grad/layer_24/attn": 0.011437132954597473, "grad/layer_24/mlp": 0.00948273204267025, "grad/layer_24/attn_mlp_ratio": 1.2061010247387292, "grad/layer_27/attn": 0.004290993325412273, "grad/layer_27/mlp": 0.009093957021832466, "grad/layer_27/attn_mlp_ratio": 0.47185106196625903} {"step": 5600, "timestamp": 1778331771.3694017, "train/loss": 2.329523491859436, "train/z_loss": 0.0013497459120117127, "train/perplexity": 10.273045176904137, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025676.1692906641, "perf/iters_per_sec": 0.9659176680043526, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352849245071412, "data/tokens_consumed": 11746148352, "data/tokens_consumed_B": 11.746148352, "train/loss_slope": -1.2722658030879074e-06} {"step": 5610, "timestamp": 1778331781.7237778, "train/loss": 2.350049138069153, "train/z_loss": 0.0013467733631841838, "train/perplexity": 10.486084978036988, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026769.4383838882, "perf/iters_per_sec": 0.9664389793319169, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347264766693116, "data/tokens_consumed": 11767119872, "data/tokens_consumed_B": 11.767119872, "train/loss_slope": -2.007579946532279e-06} {"step": 5620, "timestamp": 1778331792.0769432, "train/loss": 2.3803789138793947, "train/z_loss": 0.0013462067348882557, "train/perplexity": 10.80899776735079, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026816.419889246, "perf/iters_per_sec": 0.9664613818594199, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347024917602539, "data/tokens_consumed": 11788091392, "data/tokens_consumed_B": 11.788091392, "train/loss_slope": -1.9326739364154517e-07} {"step": 5625, "timestamp": 1778331797.8462403, "eos/sharpness": 29.028367996215813, "eos/L0_probe": 2.345262289047241, "eos/L_plus": 2.4677748680114746, "eos/L_minus": 2.513033390045166, "eos/grad_norm": 0.1289404034614563, "eos/embed_grad_frac": 0.14854910969734192, "eos/time_s": 0.6012375354766846} {"step": 5625, "timestamp": 1778331799.2243836, "geo/rankme_last": 427.381591796875, "geo/layer_0/stable_rank_q_proj": 20.704538345336914, "geo/layer_0/stable_rank_k_proj": 17.257925033569336, "geo/layer_0/stable_rank_o_proj": 45.259334564208984, "geo/layer_0/stable_rank_gate_proj": 128.78224182128906, "geo/layer_0/stable_rank_down_proj": 56.4967041015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06142483651638031, "geo/layer_0/attn_entropy_mean": 6.2554216384887695, "geo/layer_0/attn_entropy_std": 0.43655550479888916, "geo/layer_7/stable_rank_q_proj": 42.43254089355469, "geo/layer_7/stable_rank_k_proj": 39.296104431152344, "geo/layer_7/stable_rank_o_proj": 90.07501983642578, "geo/layer_7/stable_rank_gate_proj": 79.35723876953125, "geo/layer_7/stable_rank_down_proj": 143.5165252685547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40893781185150146, "geo/layer_7/attn_entropy_mean": 4.7310075759887695, "geo/layer_7/attn_entropy_std": 0.7713019251823425, "geo/layer_14/stable_rank_q_proj": 51.60280990600586, "geo/layer_14/stable_rank_k_proj": 41.814483642578125, "geo/layer_14/stable_rank_o_proj": 42.69780349731445, "geo/layer_14/stable_rank_gate_proj": 72.0601577758789, "geo/layer_14/stable_rank_down_proj": 127.36679077148438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.35848522186279297, "geo/layer_14/attn_entropy_mean": 5.52117919921875, "geo/layer_14/attn_entropy_std": 0.4295218288898468, "geo/layer_21/stable_rank_q_proj": 39.34465789794922, "geo/layer_21/stable_rank_k_proj": 29.153114318847656, "geo/layer_21/stable_rank_o_proj": 66.00366973876953, "geo/layer_21/stable_rank_gate_proj": 62.22645568847656, "geo/layer_21/stable_rank_down_proj": 49.58412170410156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13999933004379272, "geo/layer_21/attn_entropy_mean": 5.871509552001953, "geo/layer_21/attn_entropy_std": 0.3231639564037323, "geo/layer_27/stable_rank_q_proj": 43.308162689208984, "geo/layer_27/stable_rank_k_proj": 30.551986694335938, "geo/layer_27/stable_rank_o_proj": 109.10551452636719, "geo/layer_27/stable_rank_gate_proj": 72.95516204833984, "geo/layer_27/stable_rank_down_proj": 128.1078643798828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10406778007745743, "geo/layer_27/attn_entropy_mean": 4.316285133361816, "geo/layer_27/attn_entropy_std": 0.6647972464561462, "attnres/final_alpha/block_0": 0.2510724663734436, "attnres/block_norm/0": 1.7776556015014648, "attnres/final_alpha/block_1": 0.004104660823941231, "attnres/block_norm/1": 49881.8515625, "attnres/final_alpha/block_2": 0.008635911159217358, "attnres/block_norm/2": 29799.2890625, "attnres/final_alpha/block_3": 0.010596778243780136, "attnres/block_norm/3": 69590.671875, "attnres/final_alpha/block_4": 0.01198851689696312, "attnres/block_norm/4": 16879.525390625, "attnres/final_alpha/block_5": 0.6105613112449646, "attnres/block_norm/5": 7024.21337890625, "attnres/final_alpha/block_6": 0.10304033756256104, "attnres/block_norm/6": 46162.515625, "geo/tier1_time_s": 1.3572566509246826, "geo/step": 5625.0, "geo/rankme_slope": 0.001094994013230292} {"step": 5630, "timestamp": 1778331804.4030592, "train/loss": 2.365809988975525, "train/z_loss": 0.001343873015139252, "train/perplexity": 10.652663864232071, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702449.5392805303, "perf/iters_per_sec": 0.8117912002947475, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.231843852996826, "data/tokens_consumed": 11809062912, "data/tokens_consumed_B": 11.809062912, "train/loss_slope": 2.673995398273324e-06} {"step": 5640, "timestamp": 1778331814.7513285, "train/loss": 2.3308544158935547, "train/z_loss": 0.0013545737252570688, "train/perplexity": 10.286726922295946, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027352.0508947568, "perf/iters_per_sec": 0.9667167906259331, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034429121017456, "data/tokens_consumed": 11830034432, "data/tokens_consumed_B": 11.830034432, "train/loss_slope": 1.4337981888169683e-06} {"step": 5650, "timestamp": 1778331825.0958765, "grad/layer_0/attn": 0.0029194201342761517, "grad/layer_0/mlp": 0.003411714918911457, "grad/layer_0/attn_mlp_ratio": 0.8557045703095163, "grad/layer_4/attn": 0.0020711845718324184, "grad/layer_4/mlp": 0.0026525361463427544, "grad/layer_4/attn_mlp_ratio": 0.7808317698535173, "grad/layer_8/attn": 0.014007773250341415, "grad/layer_8/mlp": 0.0036713629961013794, "grad/layer_8/attn_mlp_ratio": 3.8154148428457733, "grad/layer_12/attn": 0.006450476124882698, "grad/layer_12/mlp": 0.006971714552491903, "grad/layer_12/attn_mlp_ratio": 0.9252352464794441, "grad/layer_16/attn": 0.0031798870768398046, "grad/layer_16/mlp": 0.004463742487132549, "grad/layer_16/attn_mlp_ratio": 0.7123813738736497, "grad/layer_20/attn": 0.0027498204726725817, "grad/layer_20/mlp": 0.006071705371141434, "grad/layer_20/attn_mlp_ratio": 0.45289095226083764, "grad/layer_24/attn": 0.014245869591832161, "grad/layer_24/mlp": 0.013016796670854092, "grad/layer_24/attn_mlp_ratio": 1.094422064246258, "grad/layer_27/attn": 0.008821739815175533, "grad/layer_27/mlp": 0.013364771381020546, "grad/layer_27/attn_mlp_ratio": 0.66007412305578} {"step": 5650, "timestamp": 1778331825.111749, "train/loss": 2.338768410682678, "train/z_loss": 0.0013599154306575656, "train/perplexity": 10.368459012642782, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025566.967885892, "perf/iters_per_sec": 0.9658655967168293, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353407382965087, "data/tokens_consumed": 11851005952, "data/tokens_consumed_B": 11.851005952, "train/loss_slope": 8.668535863748376e-07} {"step": 5660, "timestamp": 1778331835.4651814, "train/loss": 2.3970706462860107, "train/z_loss": 0.0013625811785459518, "train/perplexity": 10.990932847385073, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026577.6120578896, "perf/iters_per_sec": 0.9663475094117592, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348244190216065, "data/tokens_consumed": 11871977472, "data/tokens_consumed_B": 11.871977472, "train/loss_slope": 3.4688982633080385e-06} {"step": 5670, "timestamp": 1778331845.8194137, "train/loss": 2.355262875556946, "train/z_loss": 0.0013639172189868986, "train/perplexity": 10.54089944233217, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026809.274450277, "perf/iters_per_sec": 0.9664579746486077, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034706139564514, "data/tokens_consumed": 11892948992, "data/tokens_consumed_B": 11.892948992, "train/loss_slope": 5.719936927660315e-06} {"step": 5680, "timestamp": 1778331856.173337, "train/loss": 2.3675803422927855, "train/z_loss": 0.0013635148643516004, "train/perplexity": 10.67153954642513, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026878.9096004514, "perf/iters_per_sec": 0.9664911792757279, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346705913543701, "data/tokens_consumed": 11913920512, "data/tokens_consumed_B": 11.913920512, "train/loss_slope": 7.588075654412926e-06} {"step": 5690, "timestamp": 1778331866.5273914, "train/loss": 2.355861043930054, "train/z_loss": 0.0013654708978720008, "train/perplexity": 10.547206561174132, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026466.6794224898, "perf/iters_per_sec": 0.9662946126091432, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034881067276001, "data/tokens_consumed": 11934892032, "data/tokens_consumed_B": 11.934892032, "train/loss_slope": 6.7523856153009965e-06} {"step": 5700, "timestamp": 1778331876.863744, "grad/layer_0/attn": 0.003273449605330825, "grad/layer_0/mlp": 0.0034939951729029417, "grad/layer_0/attn_mlp_ratio": 0.9368786588572344, "grad/layer_4/attn": 0.00208050268702209, "grad/layer_4/mlp": 0.002766683930531144, "grad/layer_4/attn_mlp_ratio": 0.7519842034952859, "grad/layer_8/attn": 0.004127131775021553, "grad/layer_8/mlp": 0.003881545504555106, "grad/layer_8/attn_mlp_ratio": 1.0632702009679453, "grad/layer_12/attn": 0.007179447449743748, "grad/layer_12/mlp": 0.00743858003988862, "grad/layer_12/attn_mlp_ratio": 0.9651636891353365, "grad/layer_16/attn": 0.004133538343012333, "grad/layer_16/mlp": 0.005841466132551432, "grad/layer_16/attn_mlp_ratio": 0.7076200012897942, "grad/layer_20/attn": 0.004499610047787428, "grad/layer_20/mlp": 0.008571961894631386, "grad/layer_20/attn_mlp_ratio": 0.5249218382682437, "grad/layer_24/attn": 0.021561410278081894, "grad/layer_24/mlp": 0.01718534715473652, "grad/layer_24/attn_mlp_ratio": 1.2546391968971864, "grad/layer_27/attn": 0.018505647778511047, "grad/layer_27/mlp": 0.017178861424326897, "grad/layer_27/attn_mlp_ratio": 1.0772336544133203} {"step": 5700, "timestamp": 1778331877.4714153, "eos/sharpness": 78.88648509979247, "eos/L0_probe": 2.3451075553894043, "eos/L_plus": 2.847938299179077, "eos/L_minus": 2.6311416625976562, "eos/grad_norm": 0.37344419956207275, "eos/embed_grad_frac": 0.016749488189816475, "eos/time_s": 0.6047694683074951} {"step": 5700, "timestamp": 1778331877.4931788, "train/loss": 2.368318057060242, "train/z_loss": 0.001340307341888547, "train/perplexity": 10.679415003303275, "train/grad_norm": 0.373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913675.2366925376, "perf/iters_per_sec": 0.9125114615881622, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.09587664604187, "data/tokens_consumed": 11955863552, "data/tokens_consumed_B": 11.955863552, "train/loss_slope": 4.575698882868899e-06} {"step": 5700, "timestamp": 1778331878.8574157, "geo/rankme_last": 427.60174560546875, "geo/layer_0/stable_rank_q_proj": 20.704345703125, "geo/layer_0/stable_rank_k_proj": 17.24350357055664, "geo/layer_0/stable_rank_o_proj": 45.25129318237305, "geo/layer_0/stable_rank_gate_proj": 128.87649536132812, "geo/layer_0/stable_rank_down_proj": 56.42219543457031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0669412910938263, "geo/layer_0/attn_entropy_mean": 6.255798816680908, "geo/layer_0/attn_entropy_std": 0.433710515499115, "geo/layer_7/stable_rank_q_proj": 42.35279846191406, "geo/layer_7/stable_rank_k_proj": 39.267494201660156, "geo/layer_7/stable_rank_o_proj": 90.04729461669922, "geo/layer_7/stable_rank_gate_proj": 79.26051330566406, "geo/layer_7/stable_rank_down_proj": 143.0970001220703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40169069170951843, "geo/layer_7/attn_entropy_mean": 4.7217864990234375, "geo/layer_7/attn_entropy_std": 0.7695021629333496, "geo/layer_14/stable_rank_q_proj": 51.64045333862305, "geo/layer_14/stable_rank_k_proj": 41.9156379699707, "geo/layer_14/stable_rank_o_proj": 42.6953125, "geo/layer_14/stable_rank_gate_proj": 72.05606079101562, "geo/layer_14/stable_rank_down_proj": 127.14595031738281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3718854784965515, "geo/layer_14/attn_entropy_mean": 5.545132637023926, "geo/layer_14/attn_entropy_std": 0.4464992880821228, "geo/layer_21/stable_rank_q_proj": 39.33346939086914, "geo/layer_21/stable_rank_k_proj": 29.133485794067383, "geo/layer_21/stable_rank_o_proj": 65.92460632324219, "geo/layer_21/stable_rank_gate_proj": 62.21242141723633, "geo/layer_21/stable_rank_down_proj": 49.555110931396484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14174510538578033, "geo/layer_21/attn_entropy_mean": 5.8965229988098145, "geo/layer_21/attn_entropy_std": 0.3161185383796692, "geo/layer_27/stable_rank_q_proj": 43.32486343383789, "geo/layer_27/stable_rank_k_proj": 30.556352615356445, "geo/layer_27/stable_rank_o_proj": 109.16744995117188, "geo/layer_27/stable_rank_gate_proj": 72.8927993774414, "geo/layer_27/stable_rank_down_proj": 128.02281188964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10687649250030518, "geo/layer_27/attn_entropy_mean": 4.331523895263672, "geo/layer_27/attn_entropy_std": 0.654095470905304, "attnres/final_alpha/block_0": 0.24870598316192627, "attnres/block_norm/0": 1.7775228023529053, "attnres/final_alpha/block_1": 0.00397108681499958, "attnres/block_norm/1": 49965.515625, "attnres/final_alpha/block_2": 0.008451541885733604, "attnres/block_norm/2": 29790.923828125, "attnres/final_alpha/block_3": 0.010256180539727211, "attnres/block_norm/3": 70151.4453125, "attnres/final_alpha/block_4": 0.011613180860877037, "attnres/block_norm/4": 16876.703125, "attnres/final_alpha/block_5": 0.6158795356750488, "attnres/block_norm/5": 6928.30517578125, "attnres/final_alpha/block_6": 0.10112252086400986, "attnres/block_norm/6": 46207.890625, "geo/tier1_time_s": 1.359922170639038, "geo/step": 5700.0, "geo/rankme_slope": 0.0010946300395158063} {"step": 5710, "timestamp": 1778331889.223739, "train/loss": 2.3668907403945925, "train/z_loss": 0.0013441086746752262, "train/perplexity": 10.66418296934357, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788408.6891523812, "perf/iters_per_sec": 0.8527797170411974, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1726357698440553, "data/tokens_consumed": 11976835072, "data/tokens_consumed_B": 11.976835072, "train/loss_slope": 2.485290533638605e-06} {"step": 5720, "timestamp": 1778331899.6054723, "train/loss": 2.3899534702301026, "train/z_loss": 0.0013495572144165634, "train/perplexity": 10.912986152493808, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021477.9582884063, "perf/iters_per_sec": 0.9639158050004989, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374350070953369, "data/tokens_consumed": 11997806592, "data/tokens_consumed_B": 11.997806592, "train/loss_slope": 3.549955401233149e-06} {"step": 5730, "timestamp": 1778331909.9717548, "train/loss": 2.3313032388687134, "train/z_loss": 0.0013386916951276362, "train/perplexity": 10.291344877922628, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025079.786680038, "perf/iters_per_sec": 0.9656332906151, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355898141860962, "data/tokens_consumed": 12018778112, "data/tokens_consumed_B": 12.018778112, "train/loss_slope": 1.1355346865098271e-07} {"step": 5740, "timestamp": 1778331920.3277419, "train/loss": 2.371799039840698, "train/z_loss": 0.0013535946258343757, "train/perplexity": 10.71665464069992, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026628.0866918552, "perf/iters_per_sec": 0.9663715775927807, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347986459732055, "data/tokens_consumed": 12039749632, "data/tokens_consumed_B": 12.039749632, "train/loss_slope": 1.9754756008437524e-06} {"step": 5750, "timestamp": 1778331930.67118, "grad/layer_0/attn": 0.0027537664864212275, "grad/layer_0/mlp": 0.0031051030382514, "grad/layer_0/attn_mlp_ratio": 0.8868518576719409, "grad/layer_4/attn": 0.003313404740765691, "grad/layer_4/mlp": 0.002666183514520526, "grad/layer_4/attn_mlp_ratio": 1.2427518955259793, "grad/layer_8/attn": 0.003256799653172493, "grad/layer_8/mlp": 0.0034402022138237953, "grad/layer_8/attn_mlp_ratio": 0.9466884084362324, "grad/layer_12/attn": 0.007386144250631332, "grad/layer_12/mlp": 0.0072199031710624695, "grad/layer_12/attn_mlp_ratio": 1.0230253749015117, "grad/layer_16/attn": 0.003828703658655286, "grad/layer_16/mlp": 0.004587338771671057, "grad/layer_16/attn_mlp_ratio": 0.8346241177645073, "grad/layer_20/attn": 0.0034886300563812256, "grad/layer_20/mlp": 0.005641983821988106, "grad/layer_20/attn_mlp_ratio": 0.6183339237790512, "grad/layer_24/attn": 0.005353427026420832, "grad/layer_24/mlp": 0.008916105143725872, "grad/layer_24/attn_mlp_ratio": 0.6004221439835468, "grad/layer_27/attn": 0.007896781899034977, "grad/layer_27/mlp": 0.008489962667226791, "grad/layer_27/attn_mlp_ratio": 0.9301315112379964} {"step": 5750, "timestamp": 1778331930.6871974, "train/loss": 2.3707664966583253, "train/z_loss": 0.0013566980487667024, "train/perplexity": 10.705594942803257, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025350.325881546, "perf/iters_per_sec": 0.9657622937591295, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354514837265014, "data/tokens_consumed": 12060721152, "data/tokens_consumed_B": 12.060721152, "train/loss_slope": 1.1165355369917894e-06} {"step": 5760, "timestamp": 1778331941.0355654, "train/loss": 2.3409228563308715, "train/z_loss": 0.0013520643580704928, "train/perplexity": 10.390821374637957, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027981.2391604385, "perf/iters_per_sec": 0.9670168109705155, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341081857681274, "data/tokens_consumed": 12081692672, "data/tokens_consumed_B": 12.081692672, "train/loss_slope": -8.20525072374199e-07} {"step": 5770, "timestamp": 1778331951.3828, "train/loss": 2.3313798189163206, "train/z_loss": 0.0013535337173379957, "train/perplexity": 10.292133019780906, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028070.921279286, "perf/iters_per_sec": 0.9670595747372084, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340624570846557, "data/tokens_consumed": 12102664192, "data/tokens_consumed_B": 12.102664192, "train/loss_slope": -4.323717541355612e-06} {"step": 5775, "timestamp": 1778331957.168667, "eos/sharpness": 68.55490207672118, "eos/L0_probe": 2.3441221714019775, "eos/L_plus": 2.6503512859344482, "eos/L_minus": 2.7234420776367188, "eos/grad_norm": 0.24738657474517822, "eos/embed_grad_frac": 0.0382794626057148, "eos/time_s": 0.5966231822967529} {"step": 5775, "timestamp": 1778331958.5541358, "geo/rankme_last": 427.8289489746094, "geo/layer_0/stable_rank_q_proj": 20.71263885498047, "geo/layer_0/stable_rank_k_proj": 17.228782653808594, "geo/layer_0/stable_rank_o_proj": 45.23747253417969, "geo/layer_0/stable_rank_gate_proj": 128.7467803955078, "geo/layer_0/stable_rank_down_proj": 56.421653747558594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06254057586193085, "geo/layer_0/attn_entropy_mean": 6.262027740478516, "geo/layer_0/attn_entropy_std": 0.43291422724723816, "geo/layer_7/stable_rank_q_proj": 42.44248962402344, "geo/layer_7/stable_rank_k_proj": 39.22039794921875, "geo/layer_7/stable_rank_o_proj": 89.93523406982422, "geo/layer_7/stable_rank_gate_proj": 79.32957458496094, "geo/layer_7/stable_rank_down_proj": 143.3661346435547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4089997112751007, "geo/layer_7/attn_entropy_mean": 4.739615440368652, "geo/layer_7/attn_entropy_std": 0.7499396800994873, "geo/layer_14/stable_rank_q_proj": 51.62709045410156, "geo/layer_14/stable_rank_k_proj": 41.774391174316406, "geo/layer_14/stable_rank_o_proj": 42.704017639160156, "geo/layer_14/stable_rank_gate_proj": 71.91033935546875, "geo/layer_14/stable_rank_down_proj": 127.3458251953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3672698438167572, "geo/layer_14/attn_entropy_mean": 5.519848823547363, "geo/layer_14/attn_entropy_std": 0.47183099389076233, "geo/layer_21/stable_rank_q_proj": 39.29492950439453, "geo/layer_21/stable_rank_k_proj": 29.03067398071289, "geo/layer_21/stable_rank_o_proj": 65.99783325195312, "geo/layer_21/stable_rank_gate_proj": 62.070465087890625, "geo/layer_21/stable_rank_down_proj": 49.523712158203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13816937804222107, "geo/layer_21/attn_entropy_mean": 5.866016387939453, "geo/layer_21/attn_entropy_std": 0.3164367377758026, "geo/layer_27/stable_rank_q_proj": 43.30231857299805, "geo/layer_27/stable_rank_k_proj": 30.458765029907227, "geo/layer_27/stable_rank_o_proj": 109.21699523925781, "geo/layer_27/stable_rank_gate_proj": 72.8716049194336, "geo/layer_27/stable_rank_down_proj": 127.96507263183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10243435204029083, "geo/layer_27/attn_entropy_mean": 4.338508129119873, "geo/layer_27/attn_entropy_std": 0.6689169406890869, "attnres/final_alpha/block_0": 0.2527252733707428, "attnres/block_norm/0": 1.777707815170288, "attnres/final_alpha/block_1": 0.004054035060107708, "attnres/block_norm/1": 50018.82421875, "attnres/final_alpha/block_2": 0.00882505439221859, "attnres/block_norm/2": 29684.109375, "attnres/final_alpha/block_3": 0.010538533329963684, "attnres/block_norm/3": 69153.9921875, "attnres/final_alpha/block_4": 0.012152253650128841, "attnres/block_norm/4": 16933.52734375, "attnres/final_alpha/block_5": 0.6083770990371704, "attnres/block_norm/5": 7080.50390625, "attnres/final_alpha/block_6": 0.10332774370908737, "attnres/block_norm/6": 46098.6015625, "geo/tier1_time_s": 1.3651878833770752, "geo/step": 5775.0, "geo/rankme_slope": 0.0010376388055222088} {"step": 5780, "timestamp": 1778331963.7428784, "train/loss": 2.415428566932678, "train/z_loss": 0.0013452153536491096, "train/perplexity": 11.19456695086944, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697364.8963079443, "perf/iters_per_sec": 0.8093666535892221, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2355339765548705, "data/tokens_consumed": 12123635712, "data/tokens_consumed_B": 12.123635712, "train/loss_slope": -1.4680233511022594e-06} {"step": 5790, "timestamp": 1778331974.1261628, "train/loss": 2.365110993385315, "train/z_loss": 0.0013577991980127991, "train/perplexity": 10.645220300979, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021145.847569899, "perf/iters_per_sec": 0.9637574422692771, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376054763793945, "data/tokens_consumed": 12144607232, "data/tokens_consumed_B": 12.144607232, "train/loss_slope": -5.711025137319699e-07} {"step": 5800, "timestamp": 1778331984.49596, "grad/layer_0/attn": 0.0029156638775020838, "grad/layer_0/mlp": 0.0031733487267047167, "grad/layer_0/attn_mlp_ratio": 0.9187971561669698, "grad/layer_4/attn": 0.0025796096306294203, "grad/layer_4/mlp": 0.002667035674676299, "grad/layer_4/attn_mlp_ratio": 0.9672197332795471, "grad/layer_8/attn": 0.0074229128658771515, "grad/layer_8/mlp": 0.0036190750543028116, "grad/layer_8/attn_mlp_ratio": 2.0510524234490846, "grad/layer_12/attn": 0.0100067388266325, "grad/layer_12/mlp": 0.0073412577621638775, "grad/layer_12/attn_mlp_ratio": 1.3630823238352994, "grad/layer_16/attn": 0.008619542233645916, "grad/layer_16/mlp": 0.004532295279204845, "grad/layer_16/attn_mlp_ratio": 1.9018050485399178, "grad/layer_20/attn": 0.0034696084912866354, "grad/layer_20/mlp": 0.005922453477978706, "grad/layer_20/attn_mlp_ratio": 0.5858397107893903, "grad/layer_24/attn": 0.013300715014338493, "grad/layer_24/mlp": 0.011367874220013618, "grad/layer_24/attn_mlp_ratio": 1.1700265713636566, "grad/layer_27/attn": 0.005336729343980551, "grad/layer_27/mlp": 0.011102796532213688, "grad/layer_27/attn_mlp_ratio": 0.4806653243108631} {"step": 5800, "timestamp": 1778331984.5125182, "train/loss": 2.387248730659485, "train/z_loss": 0.001337043452076614, "train/perplexity": 10.883509248657807, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020527.576362604, "perf/iters_per_sec": 0.9634626275837918, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037922978401184, "data/tokens_consumed": 12165578752, "data/tokens_consumed_B": 12.165578752, "train/loss_slope": 1.2667044912269464e-06} {"step": 5810, "timestamp": 1778331994.8661773, "train/loss": 2.3787885665893556, "train/z_loss": 0.001348700118251145, "train/perplexity": 10.79182136888304, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027022.0713333234, "perf/iters_per_sec": 0.966559444109594, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345975160598755, "data/tokens_consumed": 12186550272, "data/tokens_consumed_B": 12.186550272, "train/loss_slope": 2.6609789026845033e-06} {"step": 5820, "timestamp": 1778332005.212155, "train/loss": 2.3498242139816283, "train/z_loss": 0.0013534235185943544, "train/perplexity": 10.483726670171663, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028556.5955903714, "perf/iters_per_sec": 0.9672911622955186, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338148832321168, "data/tokens_consumed": 12207521792, "data/tokens_consumed_B": 12.207521792, "train/loss_slope": 2.994879151191712e-06} {"step": 5830, "timestamp": 1778332015.5559723, "train/loss": 2.346358561515808, "train/z_loss": 0.001347622147295624, "train/perplexity": 10.447456603012581, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028428.5596695293, "perf/iters_per_sec": 0.9672301100108763, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338801383972167, "data/tokens_consumed": 12228493312, "data/tokens_consumed_B": 12.228493312, "train/loss_slope": 1.7828949356879505e-06} {"step": 5840, "timestamp": 1778332025.9038858, "train/loss": 2.3768282175064086, "train/z_loss": 0.0013609296991489827, "train/perplexity": 10.770686354530659, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028044.7358991916, "perf/iters_per_sec": 0.9670470885749777, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340758085250854, "data/tokens_consumed": 12249464832, "data/tokens_consumed_B": 12.249464832, "train/loss_slope": 2.3160753184311006e-06} {"step": 5850, "timestamp": 1778332036.2389627, "grad/layer_0/attn": 0.002494072774425149, "grad/layer_0/mlp": 0.002930526854470372, "grad/layer_0/attn_mlp_ratio": 0.8510663144116678, "grad/layer_4/attn": 0.0024440186098217964, "grad/layer_4/mlp": 0.0025613531470298767, "grad/layer_4/attn_mlp_ratio": 0.9541903728647579, "grad/layer_8/attn": 0.004994980059564114, "grad/layer_8/mlp": 0.0034730725456029177, "grad/layer_8/attn_mlp_ratio": 1.4382020099372255, "grad/layer_12/attn": 0.007043916266411543, "grad/layer_12/mlp": 0.006852956488728523, "grad/layer_12/attn_mlp_ratio": 1.027865298022915, "grad/layer_16/attn": 0.00464432779699564, "grad/layer_16/mlp": 0.00426393561065197, "grad/layer_16/attn_mlp_ratio": 1.0892114966446123, "grad/layer_20/attn": 0.0031812607776373625, "grad/layer_20/mlp": 0.005201483145356178, "grad/layer_20/attn_mlp_ratio": 0.6116064644594509, "grad/layer_24/attn": 0.00463997945189476, "grad/layer_24/mlp": 0.007469589356333017, "grad/layer_24/attn_mlp_ratio": 0.6211826605759169, "grad/layer_27/attn": 0.006959898862987757, "grad/layer_27/mlp": 0.006929637398570776, "grad/layer_27/attn_mlp_ratio": 1.0043669476827932} {"step": 5850, "timestamp": 1778332036.8392143, "eos/sharpness": 34.128093719482415, "eos/L0_probe": 2.343379020690918, "eos/L_plus": 2.488867998123169, "eos/L_minus": 2.539170980453491, "eos/grad_norm": 0.10739859193563461, "eos/embed_grad_frac": 0.18775098025798798, "eos/time_s": 0.5975534915924072} {"step": 5850, "timestamp": 1778332036.8583283, "train/loss": 2.372781825065613, "train/z_loss": 0.001358094927854836, "train/perplexity": 10.72719198766763, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915578.1289564187, "perf/iters_per_sec": 0.913418831327638, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0947880268096923, "data/tokens_consumed": 12270436352, "data/tokens_consumed_B": 12.270436352, "train/loss_slope": 2.1962332646838036e-06} {"step": 5850, "timestamp": 1778332038.2230127, "geo/rankme_last": 428.8571472167969, "geo/layer_0/stable_rank_q_proj": 20.717002868652344, "geo/layer_0/stable_rank_k_proj": 17.22819709777832, "geo/layer_0/stable_rank_o_proj": 45.15370178222656, "geo/layer_0/stable_rank_gate_proj": 128.62384033203125, "geo/layer_0/stable_rank_down_proj": 56.39871597290039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061408378183841705, "geo/layer_0/attn_entropy_mean": 6.2586469650268555, "geo/layer_0/attn_entropy_std": 0.43474000692367554, "geo/layer_7/stable_rank_q_proj": 42.4619026184082, "geo/layer_7/stable_rank_k_proj": 39.20180130004883, "geo/layer_7/stable_rank_o_proj": 90.01951599121094, "geo/layer_7/stable_rank_gate_proj": 79.45084381103516, "geo/layer_7/stable_rank_down_proj": 143.8085174560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41253215074539185, "geo/layer_7/attn_entropy_mean": 4.742349624633789, "geo/layer_7/attn_entropy_std": 0.7632347941398621, "geo/layer_14/stable_rank_q_proj": 51.75703048706055, "geo/layer_14/stable_rank_k_proj": 41.849483489990234, "geo/layer_14/stable_rank_o_proj": 42.73455810546875, "geo/layer_14/stable_rank_gate_proj": 71.88316345214844, "geo/layer_14/stable_rank_down_proj": 127.27784729003906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37721553444862366, "geo/layer_14/attn_entropy_mean": 5.480267524719238, "geo/layer_14/attn_entropy_std": 0.4738624095916748, "geo/layer_21/stable_rank_q_proj": 39.398681640625, "geo/layer_21/stable_rank_k_proj": 29.03441619873047, "geo/layer_21/stable_rank_o_proj": 66.02627563476562, "geo/layer_21/stable_rank_gate_proj": 62.13153839111328, "geo/layer_21/stable_rank_down_proj": 49.50654220581055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13933707773685455, "geo/layer_21/attn_entropy_mean": 5.861452102661133, "geo/layer_21/attn_entropy_std": 0.3150421380996704, "geo/layer_27/stable_rank_q_proj": 43.29084014892578, "geo/layer_27/stable_rank_k_proj": 30.486940383911133, "geo/layer_27/stable_rank_o_proj": 109.05329132080078, "geo/layer_27/stable_rank_gate_proj": 72.8640365600586, "geo/layer_27/stable_rank_down_proj": 127.70355224609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09604568779468536, "geo/layer_27/attn_entropy_mean": 4.329886436462402, "geo/layer_27/attn_entropy_std": 0.6568129658699036, "attnres/final_alpha/block_0": 0.25258558988571167, "attnres/block_norm/0": 1.7778208255767822, "attnres/final_alpha/block_1": 0.00405517453327775, "attnres/block_norm/1": 49905.9921875, "attnres/final_alpha/block_2": 0.008628303185105324, "attnres/block_norm/2": 29684.21484375, "attnres/final_alpha/block_3": 0.010475881397724152, "attnres/block_norm/3": 69814.484375, "attnres/final_alpha/block_4": 0.012003155425190926, "attnres/block_norm/4": 16831.869140625, "attnres/final_alpha/block_5": 0.6099463701248169, "attnres/block_norm/5": 7003.69140625, "attnres/final_alpha/block_6": 0.10230548679828644, "attnres/block_norm/6": 46098.828125, "geo/tier1_time_s": 1.3607966899871826, "geo/step": 5850.0, "geo/rankme_slope": 0.0010540695965886354} {"step": 5860, "timestamp": 1778332048.571708, "train/loss": 2.353048062324524, "train/z_loss": 0.001358139014337212, "train/perplexity": 10.517579153340925, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790962.337292711, "perf/iters_per_sec": 0.853997391363483, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1709637641906738, "data/tokens_consumed": 12291407872, "data/tokens_consumed_B": 12.291407872, "train/loss_slope": 4.43874652987294e-06} {"step": 5870, "timestamp": 1778332058.940273, "train/loss": 2.381751036643982, "train/z_loss": 0.0013622369384393095, "train/perplexity": 10.8238392190681, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024056.5230924091, "perf/iters_per_sec": 0.9651453605138822, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361133575439454, "data/tokens_consumed": 12312379392, "data/tokens_consumed_B": 12.312379392, "train/loss_slope": 5.608218387909277e-06} {"step": 5880, "timestamp": 1778332069.2905848, "train/loss": 2.3843004941940307, "train/z_loss": 0.001349067478440702, "train/perplexity": 10.851469343634806, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027698.030410031, "perf/iters_per_sec": 0.966881766514793, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342526197433473, "data/tokens_consumed": 12333350912, "data/tokens_consumed_B": 12.333350912, "train/loss_slope": 6.6325171087512134e-06} {"step": 5890, "timestamp": 1778332079.6365092, "train/loss": 2.3690850019454954, "train/z_loss": 0.001350085809826851, "train/perplexity": 10.687608667660415, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027968.9891426188, "perf/iters_per_sec": 0.9670109697068304, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341144323348999, "data/tokens_consumed": 12354322432, "data/tokens_consumed_B": 12.354322432, "train/loss_slope": 5.682543151699744e-06} {"step": 5900, "timestamp": 1778332089.9741032, "grad/layer_0/attn": 0.0033067611511796713, "grad/layer_0/mlp": 0.0033473975490778685, "grad/layer_0/attn_mlp_ratio": 0.9878602717220072, "grad/layer_4/attn": 0.0018807362066581845, "grad/layer_4/mlp": 0.0026934330817312002, "grad/layer_4/attn_mlp_ratio": 0.6982672595758785, "grad/layer_8/attn": 0.0035167932510375977, "grad/layer_8/mlp": 0.003416670486330986, "grad/layer_8/attn_mlp_ratio": 1.0293041609299916, "grad/layer_12/attn": 0.006958230398595333, "grad/layer_12/mlp": 0.007091651204973459, "grad/layer_12/attn_mlp_ratio": 0.9811862004150491, "grad/layer_16/attn": 0.003514982061460614, "grad/layer_16/mlp": 0.004809132311493158, "grad/layer_16/attn_mlp_ratio": 0.7308973346336433, "grad/layer_20/attn": 0.0032317577861249447, "grad/layer_20/mlp": 0.006457354873418808, "grad/layer_20/attn_mlp_ratio": 0.5004770218500022, "grad/layer_24/attn": 0.009316735900938511, "grad/layer_24/mlp": 0.009836168959736824, "grad/layer_24/attn_mlp_ratio": 0.9471915177907474, "grad/layer_27/attn": 0.007592500187456608, "grad/layer_27/mlp": 0.00876664463430643, "grad/layer_27/attn_mlp_ratio": 0.8660668268836019} {"step": 5900, "timestamp": 1778332089.9898548, "train/loss": 2.2978614568710327, "train/z_loss": 0.0013604391366243362, "train/perplexity": 9.952875027006135, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026730.9114388556, "perf/iters_per_sec": 0.9664206082529333, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347461462020875, "data/tokens_consumed": 12375293952, "data/tokens_consumed_B": 12.375293952, "train/loss_slope": 2.5064038758229912e-06} {"step": 5910, "timestamp": 1778332100.3518572, "train/loss": 2.382951331138611, "train/z_loss": 0.0013559486134909094, "train/perplexity": 10.836838813803777, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025361.6115758093, "perf/iters_per_sec": 0.9657676751975104, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354457139968871, "data/tokens_consumed": 12396265472, "data/tokens_consumed_B": 12.396265472, "train/loss_slope": 3.7479310646594424e-06} {"step": 5920, "timestamp": 1778332111.3830621, "train/loss": 2.3629288196563722, "train/z_loss": 0.0013536716112866997, "train/perplexity": 10.622015908115696, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1901891.6296502633, "perf/iters_per_sec": 0.9068925998927418, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.102666401863098, "data/tokens_consumed": 12417236992, "data/tokens_consumed_B": 12.417236992, "train/loss_slope": 3.371973149310727e-06} {"step": 5925, "timestamp": 1778332117.15792, "eos/sharpness": 15.946745872497555, "eos/L0_probe": 2.348224639892578, "eos/L_plus": 2.445977210998535, "eos/L_minus": 2.4099395275115967, "eos/grad_norm": 0.09282971173524857, "eos/embed_grad_frac": 0.2712554931640625, "eos/time_s": 0.6072571277618408} {"step": 5925, "timestamp": 1778332118.5423849, "geo/rankme_last": 429.067626953125, "geo/layer_0/stable_rank_q_proj": 20.713932037353516, "geo/layer_0/stable_rank_k_proj": 17.225982666015625, "geo/layer_0/stable_rank_o_proj": 45.165870666503906, "geo/layer_0/stable_rank_gate_proj": 128.44735717773438, "geo/layer_0/stable_rank_down_proj": 56.366607666015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06827820837497711, "geo/layer_0/attn_entropy_mean": 6.258026123046875, "geo/layer_0/attn_entropy_std": 0.43433430790901184, "geo/layer_7/stable_rank_q_proj": 42.5534782409668, "geo/layer_7/stable_rank_k_proj": 39.22977066040039, "geo/layer_7/stable_rank_o_proj": 90.05008697509766, "geo/layer_7/stable_rank_gate_proj": 79.4632339477539, "geo/layer_7/stable_rank_down_proj": 143.789794921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3998848795890808, "geo/layer_7/attn_entropy_mean": 4.771334648132324, "geo/layer_7/attn_entropy_std": 0.7609468102455139, "geo/layer_14/stable_rank_q_proj": 51.88035202026367, "geo/layer_14/stable_rank_k_proj": 41.95119857788086, "geo/layer_14/stable_rank_o_proj": 42.677425384521484, "geo/layer_14/stable_rank_gate_proj": 71.9321517944336, "geo/layer_14/stable_rank_down_proj": 127.00704193115234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3686196208000183, "geo/layer_14/attn_entropy_mean": 5.514156341552734, "geo/layer_14/attn_entropy_std": 0.4527699649333954, "geo/layer_21/stable_rank_q_proj": 39.38898468017578, "geo/layer_21/stable_rank_k_proj": 29.09337043762207, "geo/layer_21/stable_rank_o_proj": 66.0008544921875, "geo/layer_21/stable_rank_gate_proj": 62.12836837768555, "geo/layer_21/stable_rank_down_proj": 49.538822174072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.136769637465477, "geo/layer_21/attn_entropy_mean": 5.848667144775391, "geo/layer_21/attn_entropy_std": 0.321494460105896, "geo/layer_27/stable_rank_q_proj": 43.31542205810547, "geo/layer_27/stable_rank_k_proj": 30.46816635131836, "geo/layer_27/stable_rank_o_proj": 108.86619567871094, "geo/layer_27/stable_rank_gate_proj": 72.74585723876953, "geo/layer_27/stable_rank_down_proj": 127.69444274902344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10260258615016937, "geo/layer_27/attn_entropy_mean": 4.326574325561523, "geo/layer_27/attn_entropy_std": 0.6588056087493896, "attnres/final_alpha/block_0": 0.2514144778251648, "attnres/block_norm/0": 1.777677059173584, "attnres/final_alpha/block_1": 0.003982510417699814, "attnres/block_norm/1": 49979.140625, "attnres/final_alpha/block_2": 0.008726987987756729, "attnres/block_norm/2": 29678.00390625, "attnres/final_alpha/block_3": 0.010658414103090763, "attnres/block_norm/3": 69462.90625, "attnres/final_alpha/block_4": 0.011843382380902767, "attnres/block_norm/4": 16851.44921875, "attnres/final_alpha/block_5": 0.6117688417434692, "attnres/block_norm/5": 7028.42919921875, "attnres/final_alpha/block_6": 0.1016053557395935, "attnres/block_norm/6": 46054.93359375, "geo/tier1_time_s": 1.3623569011688232, "geo/step": 5925.0, "geo/rankme_slope": 0.0010731445117109343} {"step": 5930, "timestamp": 1778332123.7229893, "train/loss": 2.3375601768493652, "train/z_loss": 0.0013622576603665947, "train/perplexity": 10.355939054705859, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700329.8891594193, "perf/iters_per_sec": 0.810780472354612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2333794832229614, "data/tokens_consumed": 12438208512, "data/tokens_consumed_B": 12.438208512, "train/loss_slope": 5.845565320920617e-07} {"step": 5940, "timestamp": 1778332134.0714858, "train/loss": 2.390743613243103, "train/z_loss": 0.0013458573608659207, "train/perplexity": 10.921612379780983, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027310.417932445, "perf/iters_per_sec": 0.9666969384824967, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034450364112854, "data/tokens_consumed": 12459180032, "data/tokens_consumed_B": 12.459180032, "train/loss_slope": 3.0115602826914345e-06} {"step": 5950, "timestamp": 1778332144.4210598, "grad/layer_0/attn": 0.0044265552423894405, "grad/layer_0/mlp": 0.003811892354860902, "grad/layer_0/attn_mlp_ratio": 1.1612487222048262, "grad/layer_4/attn": 0.0018568646628409624, "grad/layer_4/mlp": 0.0027775694616138935, "grad/layer_4/attn_mlp_ratio": 0.6685213895280592, "grad/layer_8/attn": 0.004717654082924128, "grad/layer_8/mlp": 0.0035244685132056475, "grad/layer_8/attn_mlp_ratio": 1.3385433665795166, "grad/layer_12/attn": 0.005927790887653828, "grad/layer_12/mlp": 0.007440426852554083, "grad/layer_12/attn_mlp_ratio": 0.7967003675264884, "grad/layer_16/attn": 0.005495138466358185, "grad/layer_16/mlp": 0.00502615375444293, "grad/layer_16/attn_mlp_ratio": 1.0933088451919732, "grad/layer_20/attn": 0.004442552570253611, "grad/layer_20/mlp": 0.006380763836205006, "grad/layer_20/attn_mlp_ratio": 0.6962414868611866, "grad/layer_24/attn": 0.015397815965116024, "grad/layer_24/mlp": 0.011428759433329105, "grad/layer_24/attn_mlp_ratio": 1.3472867217314497, "grad/layer_27/attn": 0.016072850674390793, "grad/layer_27/mlp": 0.012921774759888649, "grad/layer_27/attn_mlp_ratio": 1.2438578174182255} {"step": 5950, "timestamp": 1778332144.4382136, "train/loss": 2.3540595531463624, "train/z_loss": 0.0013562406064011156, "train/perplexity": 10.528222970276573, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023954.5749433162, "perf/iters_per_sec": 0.9650967478481847, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361655473709106, "data/tokens_consumed": 12480151552, "data/tokens_consumed_B": 12.480151552, "train/loss_slope": 4.6101333212999816e-07} {"step": 5960, "timestamp": 1778332154.816028, "train/loss": 2.3454883813858034, "train/z_loss": 0.00136369401589036, "train/perplexity": 10.438369388197852, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022370.0413850339, "perf/iters_per_sec": 0.9643411833691758, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369773864746095, "data/tokens_consumed": 12501123072, "data/tokens_consumed_B": 12.501123072, "train/loss_slope": -1.6828865179456326e-06} {"step": 5970, "timestamp": 1778332165.196464, "train/loss": 2.376031827926636, "train/z_loss": 0.0013504681875929237, "train/perplexity": 10.762112106824848, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021605.8616315515, "perf/iters_per_sec": 0.963976794067169, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373693704605103, "data/tokens_consumed": 12522094592, "data/tokens_consumed_B": 12.522094592, "train/loss_slope": 1.719566139772836e-06} {"step": 5980, "timestamp": 1778332175.575017, "train/loss": 2.3428271055221557, "train/z_loss": 0.0013577373349107802, "train/perplexity": 10.410626939217703, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021612.412874161, "perf/iters_per_sec": 0.9639799179430776, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037366008758545, "data/tokens_consumed": 12543066112, "data/tokens_consumed_B": 12.543066112, "train/loss_slope": 2.1002532267690747e-07} {"step": 5990, "timestamp": 1778332185.9512131, "train/loss": 2.3999654769897463, "train/z_loss": 0.0013502957997843623, "train/perplexity": 11.022795833979231, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022536.330817591, "perf/iters_per_sec": 0.9644204763496356, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368921279907226, "data/tokens_consumed": 12564037632, "data/tokens_consumed_B": 12.564037632, "train/loss_slope": -3.163937580968175e-07} {"step": 6000, "timestamp": 1778332196.3281162, "grad/layer_0/attn": 0.0027464372105896473, "grad/layer_0/mlp": 0.0032621591817587614, "grad/layer_0/attn_mlp_ratio": 0.841907759056122, "grad/layer_4/attn": 0.0032283421605825424, "grad/layer_4/mlp": 0.002597527112811804, "grad/layer_4/attn_mlp_ratio": 1.242852103592742, "grad/layer_8/attn": 0.005911311600357294, "grad/layer_8/mlp": 0.0034457857254892588, "grad/layer_8/attn_mlp_ratio": 1.7155191586865273, "grad/layer_12/attn": 0.006173672620207071, "grad/layer_12/mlp": 0.007046741433441639, "grad/layer_12/attn_mlp_ratio": 0.8761031734892993, "grad/layer_16/attn": 0.004502131137996912, "grad/layer_16/mlp": 0.004613478202372789, "grad/layer_16/attn_mlp_ratio": 0.9758648123870857, "grad/layer_20/attn": 0.0042214966379106045, "grad/layer_20/mlp": 0.006091385148465633, "grad/layer_20/attn_mlp_ratio": 0.6930273600695281, "grad/layer_24/attn": 0.007357487455010414, "grad/layer_24/mlp": 0.009640036150813103, "grad/layer_24/attn_mlp_ratio": 0.7632219696673689, "grad/layer_27/attn": 0.00581577280536294, "grad/layer_27/mlp": 0.009799575433135033, "grad/layer_27/attn_mlp_ratio": 0.5934719096452932} {"step": 6000, "timestamp": 1778332196.924836, "eos/sharpness": 28.67708206176757, "eos/L0_probe": 2.3466603755950928, "eos/L_plus": 2.5018911361694336, "eos/L_minus": 2.4782004356384277, "eos/grad_norm": 0.1383131593465805, "eos/embed_grad_frac": 0.14002865552902222, "eos/time_s": 0.5940001010894775} {"step": 6000, "timestamp": 1778332196.944302, "train/loss": 2.3870102405548095, "train/z_loss": 0.001353984826710075, "train/perplexity": 10.880913948886818, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908955.7163868702, "perf/iters_per_sec": 0.9102610189375259, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0985859870910644, "data/tokens_consumed": 12585009152, "data/tokens_consumed_B": 12.585009152, "train/loss_slope": -1.7768880464706692e-06} {"step": 6000, "timestamp": 1778332198.3030255, "geo/rankme_last": 428.06072998046875, "geo/layer_0/stable_rank_q_proj": 20.721240997314453, "geo/layer_0/stable_rank_k_proj": 17.204252243041992, "geo/layer_0/stable_rank_o_proj": 45.10215377807617, "geo/layer_0/stable_rank_gate_proj": 128.6457061767578, "geo/layer_0/stable_rank_down_proj": 56.43661880493164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0643789991736412, "geo/layer_0/attn_entropy_mean": 6.255772590637207, "geo/layer_0/attn_entropy_std": 0.4311220645904541, "geo/layer_7/stable_rank_q_proj": 42.56399917602539, "geo/layer_7/stable_rank_k_proj": 39.232181549072266, "geo/layer_7/stable_rank_o_proj": 90.03682708740234, "geo/layer_7/stable_rank_gate_proj": 79.4400634765625, "geo/layer_7/stable_rank_down_proj": 143.95770263671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.402270644903183, "geo/layer_7/attn_entropy_mean": 4.692392349243164, "geo/layer_7/attn_entropy_std": 0.7574248909950256, "geo/layer_14/stable_rank_q_proj": 51.88687515258789, "geo/layer_14/stable_rank_k_proj": 42.03886032104492, "geo/layer_14/stable_rank_o_proj": 42.629852294921875, "geo/layer_14/stable_rank_gate_proj": 71.91957092285156, "geo/layer_14/stable_rank_down_proj": 126.96932983398438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38324111700057983, "geo/layer_14/attn_entropy_mean": 5.510010242462158, "geo/layer_14/attn_entropy_std": 0.4479893445968628, "geo/layer_21/stable_rank_q_proj": 39.340354919433594, "geo/layer_21/stable_rank_k_proj": 29.08648109436035, "geo/layer_21/stable_rank_o_proj": 65.9949722290039, "geo/layer_21/stable_rank_gate_proj": 62.057823181152344, "geo/layer_21/stable_rank_down_proj": 49.567848205566406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13941700756549835, "geo/layer_21/attn_entropy_mean": 5.860372543334961, "geo/layer_21/attn_entropy_std": 0.3171992301940918, "geo/layer_27/stable_rank_q_proj": 43.31553649902344, "geo/layer_27/stable_rank_k_proj": 30.426475524902344, "geo/layer_27/stable_rank_o_proj": 108.61245727539062, "geo/layer_27/stable_rank_gate_proj": 72.6707992553711, "geo/layer_27/stable_rank_down_proj": 127.62968444824219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10462956130504608, "geo/layer_27/attn_entropy_mean": 4.3206377029418945, "geo/layer_27/attn_entropy_std": 0.6437943577766418, "attnres/final_alpha/block_0": 0.2537636458873749, "attnres/block_norm/0": 1.7777645587921143, "attnres/final_alpha/block_1": 0.00405044574290514, "attnres/block_norm/1": 49964.4296875, "attnres/final_alpha/block_2": 0.008765939623117447, "attnres/block_norm/2": 29634.6640625, "attnres/final_alpha/block_3": 0.010709347203373909, "attnres/block_norm/3": 69523.4765625, "attnres/final_alpha/block_4": 0.011991984210908413, "attnres/block_norm/4": 16892.8203125, "attnres/final_alpha/block_5": 0.6086304187774658, "attnres/block_norm/5": 7053.6650390625, "attnres/final_alpha/block_6": 0.10208819806575775, "attnres/block_norm/6": 46069.484375, "geo/tier1_time_s": 1.354870319366455, "geo/step": 6000.0, "geo/rankme_slope": 0.0010489591735131554} {"step": 6000, "timestamp": 1778332205.131176, "geo/ww_alpha_mean": 7.5805281690815205, "geo/ww_alpha_std": 4.331271389356002, "geo/ww_alpha_min": 1.3498079515183532, "geo/ww_alpha_max": 26.380312333313793, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.902864725394893, "geo/ww_alpha_by_type/k_proj": 4.378073620917511, "geo/ww_alpha_by_type/v_proj": 9.175342537518988, "geo/ww_alpha_by_type/o_proj": 8.747799295527708, "geo/ww_alpha_by_type/gate_proj": 7.825989818860644, "geo/ww_alpha_by_type/up_proj": 11.118162453346487, "geo/ww_alpha_by_type/down_proj": 8.008653312897723, "geo/twonn_id/layer_0": 0.7108045816421509, "geo/twonn_id/layer_7": 3.3352210521698, "geo/twonn_id/layer_14": 5.227488994598389, "geo/twonn_id/layer_21": 9.03221607208252, "geo/twonn_id/layer_27": 7.222954273223877, "geo/tier2_time_s": 6.819701910018921} {"step": 6000, "timestamp": 1778332205.9711213, "eoc/jacobian_sigma/layer_0/attn": 1436.968505859375, "eoc/jacobian_sigma/layer_0/mlp": 11128.6201171875, "eoc/jacobian_sigma/layer_0": 11128.6201171875, "eoc/jacobian_sigma/layer_7/attn": 1.1426833868026733, "eoc/jacobian_sigma/layer_7/mlp": 1.7915174961090088, "eoc/jacobian_sigma/layer_7": 1.7915174961090088, "eoc/jacobian_sigma/layer_14/attn": 1.787282943725586, "eoc/jacobian_sigma/layer_14/mlp": 14.00670337677002, "eoc/jacobian_sigma/layer_14": 14.00670337677002, "eoc/jacobian_sigma/layer_21/attn": 1.0930525064468384, "eoc/jacobian_sigma/layer_21/mlp": 5.418606758117676, "eoc/jacobian_sigma/layer_21": 5.418606758117676, "eoc/jacobian_sigma/layer_27/attn": 3.76010799407959, "eoc/jacobian_sigma/layer_27/mlp": 31.032730102539062, "eoc/jacobian_sigma/layer_27": 31.032730102539062, "eoc/layer0_sigma": 11128.6201171875, "eoc/sigma_max": 31.032730102539062, "eoc/sigma_min": 1.7915174961090088, "eoc/sigma_mean": 13.062389433383942, "eoc/time_s": 0.8330216407775879} {"step": 6010, "timestamp": 1778332216.3683462, "train/loss": 2.3491148233413695, "train/z_loss": 0.0013480146531946956, "train/perplexity": 10.476292249862674, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1079891.2987473197, "perf/iters_per_sec": 0.5149322980629538, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9420028686523438, "data/tokens_consumed": 12605980672, "data/tokens_consumed_B": 12.605980672, "train/loss_slope": -4.693126106204994e-06} {"step": 6020, "timestamp": 1778332226.7537894, "train/loss": 2.3497470378875733, "train/z_loss": 0.0013649808010086418, "train/perplexity": 10.482917608316637, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020573.8976102234, "perf/iters_per_sec": 0.963484715275871, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378991842269898, "data/tokens_consumed": 12626952192, "data/tokens_consumed_B": 12.626952192, "train/loss_slope": -5.847048695080995e-06} {"step": 6030, "timestamp": 1778332237.131375, "train/loss": 2.3783838987350463, "train/z_loss": 0.0013614151976071298, "train/perplexity": 10.787455149179568, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021959.875418614, "perf/iters_per_sec": 0.9641456009953565, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371877431869507, "data/tokens_consumed": 12647923712, "data/tokens_consumed_B": 12.647923712, "train/loss_slope": -4.075647895485655e-06} {"step": 6040, "timestamp": 1778332247.506921, "train/loss": 2.3713347911834717, "train/z_loss": 0.0013598452205769718, "train/perplexity": 10.711680602857546, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022774.8848542515, "perf/iters_per_sec": 0.9645342277785547, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367698431015016, "data/tokens_consumed": 12668895232, "data/tokens_consumed_B": 12.668895232, "train/loss_slope": -2.939233427966207e-06} {"step": 6050, "timestamp": 1778332257.8693655, "grad/layer_0/attn": 0.0032638590782880783, "grad/layer_0/mlp": 0.0033263913355767727, "grad/layer_0/attn_mlp_ratio": 0.9812011428901928, "grad/layer_4/attn": 0.002177333692088723, "grad/layer_4/mlp": 0.002729839878156781, "grad/layer_4/attn_mlp_ratio": 0.7976048814256445, "grad/layer_8/attn": 0.006814268883317709, "grad/layer_8/mlp": 0.0035840512719005346, "grad/layer_8/attn_mlp_ratio": 1.901275449549226, "grad/layer_12/attn": 0.005823859013617039, "grad/layer_12/mlp": 0.007093393709510565, "grad/layer_12/attn_mlp_ratio": 0.8210257557966992, "grad/layer_16/attn": 0.003601653268560767, "grad/layer_16/mlp": 0.0051195318810641766, "grad/layer_16/attn_mlp_ratio": 0.7035122120307775, "grad/layer_20/attn": 0.00441515538841486, "grad/layer_20/mlp": 0.0068223485723137856, "grad/layer_20/attn_mlp_ratio": 0.6471606187957364, "grad/layer_24/attn": 0.011705375276505947, "grad/layer_24/mlp": 0.012123802676796913, "grad/layer_24/attn_mlp_ratio": 0.965487107634927, "grad/layer_27/attn": 0.006557552143931389, "grad/layer_27/mlp": 0.012785716913640499, "grad/layer_27/attn_mlp_ratio": 0.5128810638414283} {"step": 6050, "timestamp": 1778332257.8862307, "train/loss": 2.3786647081375123, "train/z_loss": 0.0013426113408058881, "train/perplexity": 10.790484793370517, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021663.6627124744, "perf/iters_per_sec": 0.9640043557703373, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03733971118927, "data/tokens_consumed": 12689866752, "data/tokens_consumed_B": 12.689866752, "train/loss_slope": -2.5587509245690742e-06} {"step": 6060, "timestamp": 1778332268.2419763, "train/loss": 2.352615571022034, "train/z_loss": 0.0013497115112841129, "train/perplexity": 10.513031375341956, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026175.399009746, "perf/iters_per_sec": 0.9661557192848902, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350298404693603, "data/tokens_consumed": 12710838272, "data/tokens_consumed_B": 12.710838272, "train/loss_slope": -2.2497681143617904e-06} {"step": 6070, "timestamp": 1778332278.597922, "train/loss": 2.357884430885315, "train/z_loss": 0.001343588752206415, "train/perplexity": 10.568569246545506, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026791.387722659, "perf/iters_per_sec": 0.9664494455922408, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347152709960938, "data/tokens_consumed": 12731809792, "data/tokens_consumed_B": 12.731809792, "train/loss_slope": -5.0517385370052675e-06} {"step": 6075, "timestamp": 1778332284.3780262, "eos/sharpness": 28.214192390441887, "eos/L0_probe": 2.3475005626678467, "eos/L_plus": 2.4742236137390137, "eos/L_minus": 2.5029194355010986, "eos/grad_norm": 0.10185618698596954, "eos/embed_grad_frac": 0.22415593266487122, "eos/time_s": 0.6181046962738037} {"step": 6075, "timestamp": 1778332285.7589624, "geo/rankme_last": 427.6833190917969, "geo/layer_0/stable_rank_q_proj": 20.726072311401367, "geo/layer_0/stable_rank_k_proj": 17.21480941772461, "geo/layer_0/stable_rank_o_proj": 45.071815490722656, "geo/layer_0/stable_rank_gate_proj": 128.6011505126953, "geo/layer_0/stable_rank_down_proj": 56.39686965942383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0676790401339531, "geo/layer_0/attn_entropy_mean": 6.255849838256836, "geo/layer_0/attn_entropy_std": 0.43343648314476013, "geo/layer_7/stable_rank_q_proj": 42.505035400390625, "geo/layer_7/stable_rank_k_proj": 39.124935150146484, "geo/layer_7/stable_rank_o_proj": 90.06548309326172, "geo/layer_7/stable_rank_gate_proj": 79.48358154296875, "geo/layer_7/stable_rank_down_proj": 144.11827087402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4085477590560913, "geo/layer_7/attn_entropy_mean": 4.732576370239258, "geo/layer_7/attn_entropy_std": 0.7561647295951843, "geo/layer_14/stable_rank_q_proj": 51.76047134399414, "geo/layer_14/stable_rank_k_proj": 42.042152404785156, "geo/layer_14/stable_rank_o_proj": 42.595699310302734, "geo/layer_14/stable_rank_gate_proj": 71.85198974609375, "geo/layer_14/stable_rank_down_proj": 127.08411407470703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3703668415546417, "geo/layer_14/attn_entropy_mean": 5.500569820404053, "geo/layer_14/attn_entropy_std": 0.43049830198287964, "geo/layer_21/stable_rank_q_proj": 39.32725524902344, "geo/layer_21/stable_rank_k_proj": 29.016691207885742, "geo/layer_21/stable_rank_o_proj": 65.97833251953125, "geo/layer_21/stable_rank_gate_proj": 62.039466857910156, "geo/layer_21/stable_rank_down_proj": 49.49591064453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13648805022239685, "geo/layer_21/attn_entropy_mean": 5.867840766906738, "geo/layer_21/attn_entropy_std": 0.32008329033851624, "geo/layer_27/stable_rank_q_proj": 43.33480453491211, "geo/layer_27/stable_rank_k_proj": 30.415449142456055, "geo/layer_27/stable_rank_o_proj": 108.55939483642578, "geo/layer_27/stable_rank_gate_proj": 72.6239013671875, "geo/layer_27/stable_rank_down_proj": 127.85281372070312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0945730209350586, "geo/layer_27/attn_entropy_mean": 4.333559036254883, "geo/layer_27/attn_entropy_std": 0.6468350887298584, "attnres/final_alpha/block_0": 0.25482410192489624, "attnres/block_norm/0": 1.7777565717697144, "attnres/final_alpha/block_1": 0.004115729127079248, "attnres/block_norm/1": 49964.2734375, "attnres/final_alpha/block_2": 0.008929421193897724, "attnres/block_norm/2": 29632.697265625, "attnres/final_alpha/block_3": 0.010794168338179588, "attnres/block_norm/3": 69658.015625, "attnres/final_alpha/block_4": 0.012229962274432182, "attnres/block_norm/4": 16928.05078125, "attnres/final_alpha/block_5": 0.605398952960968, "attnres/block_norm/5": 7097.0419921875, "attnres/final_alpha/block_6": 0.1037076786160469, "attnres/block_norm/6": 46103.6484375, "geo/tier1_time_s": 1.3611927032470703, "geo/step": 6075.0, "geo/rankme_slope": 0.0010217593873486895} {"step": 6080, "timestamp": 1778332290.9425018, "train/loss": 2.3059642791748045, "train/z_loss": 0.0013654253096319735, "train/perplexity": 10.033849020669154, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699406.956947653, "perf/iters_per_sec": 0.8103403839815393, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340493202209473, "data/tokens_consumed": 12752781312, "data/tokens_consumed_B": 12.752781312, "train/loss_slope": -8.026795583267762e-06} {"step": 6090, "timestamp": 1778332301.3156314, "train/loss": 2.376049780845642, "train/z_loss": 0.0013448703452013434, "train/perplexity": 10.7623053198862, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025655.5969170383, "perf/iters_per_sec": 0.9659078583321754, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352954387664794, "data/tokens_consumed": 12773752832, "data/tokens_consumed_B": 12.773752832, "train/loss_slope": -7.124343005665965e-06} {"step": 6100, "timestamp": 1778332311.6737406, "grad/layer_0/attn": 0.004217271693050861, "grad/layer_0/mlp": 0.004131549969315529, "grad/layer_0/attn_mlp_ratio": 1.0207480539500116, "grad/layer_4/attn": 0.0022389059886336327, "grad/layer_4/mlp": 0.0027648627292364836, "grad/layer_4/attn_mlp_ratio": 0.8097710906156983, "grad/layer_8/attn": 0.0042120469734072685, "grad/layer_8/mlp": 0.0035119724925607443, "grad/layer_8/attn_mlp_ratio": 1.1993393633906657, "grad/layer_12/attn": 0.007796968799084425, "grad/layer_12/mlp": 0.0074376934207975864, "grad/layer_12/attn_mlp_ratio": 1.0483046629015056, "grad/layer_16/attn": 0.003944749012589455, "grad/layer_16/mlp": 0.005150485783815384, "grad/layer_16/attn_mlp_ratio": 0.7658984223188005, "grad/layer_20/attn": 0.005247749388217926, "grad/layer_20/mlp": 0.007477704901248217, "grad/layer_20/attn_mlp_ratio": 0.7017860944423383, "grad/layer_24/attn": 0.020085806027054787, "grad/layer_24/mlp": 0.015215895138680935, "grad/layer_24/attn_mlp_ratio": 1.3200541744000613, "grad/layer_27/attn": 0.00589675921946764, "grad/layer_27/mlp": 0.015980569645762444, "grad/layer_27/attn_mlp_ratio": 0.3689955560583989} {"step": 6100, "timestamp": 1778332311.6897292, "train/loss": 2.3452614307403565, "train/z_loss": 0.0013553234399296342, "train/perplexity": 10.436000662329887, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022821.0766086858, "perf/iters_per_sec": 0.9645562537234715, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367461681365966, "data/tokens_consumed": 12794724352, "data/tokens_consumed_B": 12.794724352, "train/loss_slope": -8.939108189040067e-06} {"step": 6110, "timestamp": 1778332322.0379145, "train/loss": 2.35627658367157, "train/z_loss": 0.0013517288025468588, "train/perplexity": 10.551590255398784, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027556.1285400747, "perf/iters_per_sec": 0.9668141024303792, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343250036239624, "data/tokens_consumed": 12815695872, "data/tokens_consumed_B": 12.815695872, "train/loss_slope": -9.206703236394215e-06} {"step": 6120, "timestamp": 1778332332.4020386, "train/loss": 2.4045725107192992, "train/z_loss": 0.0013455661712214351, "train/perplexity": 11.07369538412499, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024731.950217257, "perf/iters_per_sec": 0.965467429264668, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357677221298218, "data/tokens_consumed": 12836667392, "data/tokens_consumed_B": 12.836667392, "train/loss_slope": -3.7066205142319302e-06} {"step": 6130, "timestamp": 1778332342.7535036, "train/loss": 2.359908938407898, "train/z_loss": 0.0013511423370800913, "train/perplexity": 10.589987067442529, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027419.713946919, "perf/iters_per_sec": 0.9667490548834414, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343945980072022, "data/tokens_consumed": 12857638912, "data/tokens_consumed_B": 12.857638912, "train/loss_slope": -3.4165742910198164e-06} {"step": 6140, "timestamp": 1778332353.1030204, "train/loss": 2.341030740737915, "train/z_loss": 0.0013629548251628877, "train/perplexity": 10.39194244271245, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027251.8730120363, "perf/iters_per_sec": 0.9666690220890218, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344802379608153, "data/tokens_consumed": 12878610432, "data/tokens_consumed_B": 12.878610432, "train/loss_slope": -4.76673610068072e-06} {"step": 6150, "timestamp": 1778332363.451068, "grad/layer_0/attn": 0.0029351194389164448, "grad/layer_0/mlp": 0.0032968856394290924, "grad/layer_0/attn_mlp_ratio": 0.890270294725078, "grad/layer_4/attn": 0.002426251769065857, "grad/layer_4/mlp": 0.0025531535502523184, "grad/layer_4/attn_mlp_ratio": 0.950296026573285, "grad/layer_8/attn": 0.008633214049041271, "grad/layer_8/mlp": 0.0034888857044279575, "grad/layer_8/attn_mlp_ratio": 2.4744902908786335, "grad/layer_12/attn": 0.006695897318422794, "grad/layer_12/mlp": 0.007097790017724037, "grad/layer_12/attn_mlp_ratio": 0.9433777566488382, "grad/layer_16/attn": 0.004030412994325161, "grad/layer_16/mlp": 0.004299336578696966, "grad/layer_16/attn_mlp_ratio": 0.9374499592682959, "grad/layer_20/attn": 0.00634158356115222, "grad/layer_20/mlp": 0.006595499813556671, "grad/layer_20/attn_mlp_ratio": 0.9615015759635535, "grad/layer_24/attn": 0.013726627454161644, "grad/layer_24/mlp": 0.013244579546153545, "grad/layer_24/attn_mlp_ratio": 1.036395855579161, "grad/layer_27/attn": 0.006306841038167477, "grad/layer_27/mlp": 0.014489740133285522, "grad/layer_27/attn_mlp_ratio": 0.4352625331184017} {"step": 6150, "timestamp": 1778332364.056271, "eos/sharpness": 53.69977951049804, "eos/L0_probe": 2.3490054607391357, "eos/L_plus": 2.645247459411621, "eos/L_minus": 2.589761257171631, "eos/grad_norm": 0.21324104070663452, "eos/embed_grad_frac": 0.057827915996313095, "eos/time_s": 0.6023108959197998} {"step": 6150, "timestamp": 1778332364.0761108, "train/loss": 2.3260026216506957, "train/z_loss": 0.0013584299362264574, "train/perplexity": 10.236938718564039, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912391.7352173368, "perf/iters_per_sec": 0.911899440392178, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0966121435165406, "data/tokens_consumed": 12899581952, "data/tokens_consumed_B": 12.899581952, "train/loss_slope": -7.970850009157146e-06} {"step": 6150, "timestamp": 1778332365.439502, "geo/rankme_last": 427.19085693359375, "geo/layer_0/stable_rank_q_proj": 20.728790283203125, "geo/layer_0/stable_rank_k_proj": 17.207496643066406, "geo/layer_0/stable_rank_o_proj": 45.05820846557617, "geo/layer_0/stable_rank_gate_proj": 128.83399963378906, "geo/layer_0/stable_rank_down_proj": 56.454994201660156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06631340086460114, "geo/layer_0/attn_entropy_mean": 6.249876976013184, "geo/layer_0/attn_entropy_std": 0.4408379793167114, "geo/layer_7/stable_rank_q_proj": 42.60074234008789, "geo/layer_7/stable_rank_k_proj": 39.15979766845703, "geo/layer_7/stable_rank_o_proj": 89.9646224975586, "geo/layer_7/stable_rank_gate_proj": 79.50975799560547, "geo/layer_7/stable_rank_down_proj": 144.26513671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39953532814979553, "geo/layer_7/attn_entropy_mean": 4.715321063995361, "geo/layer_7/attn_entropy_std": 0.7534964680671692, "geo/layer_14/stable_rank_q_proj": 51.815277099609375, "geo/layer_14/stable_rank_k_proj": 42.094154357910156, "geo/layer_14/stable_rank_o_proj": 42.555511474609375, "geo/layer_14/stable_rank_gate_proj": 71.93368530273438, "geo/layer_14/stable_rank_down_proj": 126.97988891601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3723941743373871, "geo/layer_14/attn_entropy_mean": 5.504904747009277, "geo/layer_14/attn_entropy_std": 0.4641738831996918, "geo/layer_21/stable_rank_q_proj": 39.32552719116211, "geo/layer_21/stable_rank_k_proj": 29.030513763427734, "geo/layer_21/stable_rank_o_proj": 66.08041381835938, "geo/layer_21/stable_rank_gate_proj": 62.01601791381836, "geo/layer_21/stable_rank_down_proj": 49.573150634765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13855265080928802, "geo/layer_21/attn_entropy_mean": 5.84786319732666, "geo/layer_21/attn_entropy_std": 0.3218421936035156, "geo/layer_27/stable_rank_q_proj": 43.42459487915039, "geo/layer_27/stable_rank_k_proj": 30.501319885253906, "geo/layer_27/stable_rank_o_proj": 108.66753387451172, "geo/layer_27/stable_rank_gate_proj": 72.55075073242188, "geo/layer_27/stable_rank_down_proj": 128.03302001953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09464376419782639, "geo/layer_27/attn_entropy_mean": 4.317059516906738, "geo/layer_27/attn_entropy_std": 0.6624712347984314, "attnres/final_alpha/block_0": 0.2512455880641937, "attnres/block_norm/0": 1.77762770652771, "attnres/final_alpha/block_1": 0.004016859456896782, "attnres/block_norm/1": 49911.34765625, "attnres/final_alpha/block_2": 0.0085279056802392, "attnres/block_norm/2": 29750.064453125, "attnres/final_alpha/block_3": 0.0105174221098423, "attnres/block_norm/3": 69640.78125, "attnres/final_alpha/block_4": 0.011888982728123665, "attnres/block_norm/4": 16836.40625, "attnres/final_alpha/block_5": 0.6138392686843872, "attnres/block_norm/5": 7002.837890625, "attnres/final_alpha/block_6": 0.09996393322944641, "attnres/block_norm/6": 46633.05078125, "geo/tier1_time_s": 1.3600592613220215, "geo/step": 6150.0, "geo/rankme_slope": 0.0009768599236569628} {"step": 6160, "timestamp": 1778332375.7903788, "train/loss": 2.3857324600219725, "train/z_loss": 0.0013567698653787375, "train/perplexity": 10.867019407840981, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790744.4819722737, "perf/iters_per_sec": 0.8538935098515862, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171106219291687, "data/tokens_consumed": 12920553472, "data/tokens_consumed_B": 12.920553472, "train/loss_slope": -5.5385609914904315e-06} {"step": 6170, "timestamp": 1778332386.142768, "train/loss": 2.3808919191360474, "train/z_loss": 0.0013582455925643445, "train/perplexity": 10.814544262593554, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027046.081512708, "perf/iters_per_sec": 0.9665708930552998, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345852613449096, "data/tokens_consumed": 12941524992, "data/tokens_consumed_B": 12.941524992, "train/loss_slope": -4.891989133109826e-06} {"step": 6180, "timestamp": 1778332396.4985995, "train/loss": 2.3608824253082275, "train/z_loss": 0.0013457058579660953, "train/perplexity": 10.600301300698277, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026121.4466390994, "perf/iters_per_sec": 0.9661299927897927, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350574016571046, "data/tokens_consumed": 12962496512, "data/tokens_consumed_B": 12.962496512, "train/loss_slope": -4.044599201169246e-06} {"step": 6190, "timestamp": 1778332406.8502507, "train/loss": 2.337213397026062, "train/z_loss": 0.0013578175217844545, "train/perplexity": 10.352348446601534, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026849.8593451122, "perf/iters_per_sec": 0.9664773270345269, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346854209899903, "data/tokens_consumed": 12983468032, "data/tokens_consumed_B": 12.983468032, "train/loss_slope": -7.362945624167564e-06} {"step": 6200, "timestamp": 1778332417.1898763, "grad/layer_0/attn": 0.0033619957976043224, "grad/layer_0/mlp": 0.0033003243152052164, "grad/layer_0/attn_mlp_ratio": 1.0186864606748882, "grad/layer_4/attn": 0.002106380881741643, "grad/layer_4/mlp": 0.0027553208637982607, "grad/layer_4/attn_mlp_ratio": 0.7644774998691833, "grad/layer_8/attn": 0.00626843236386776, "grad/layer_8/mlp": 0.003455423517152667, "grad/layer_8/attn_mlp_ratio": 1.814085061163373, "grad/layer_12/attn": 0.005881257355213165, "grad/layer_12/mlp": 0.0075648026540875435, "grad/layer_12/attn_mlp_ratio": 0.7774501922122562, "grad/layer_16/attn": 0.004000029526650906, "grad/layer_16/mlp": 0.004464388359338045, "grad/layer_16/attn_mlp_ratio": 0.8959859929491907, "grad/layer_20/attn": 0.0026010728906840086, "grad/layer_20/mlp": 0.005670798476785421, "grad/layer_20/attn_mlp_ratio": 0.45867841283096084, "grad/layer_24/attn": 0.01017093937844038, "grad/layer_24/mlp": 0.010613265447318554, "grad/layer_24/attn_mlp_ratio": 0.9583232731805219, "grad/layer_27/attn": 0.0037549587432295084, "grad/layer_27/mlp": 0.010516084730625153, "grad/layer_27/attn_mlp_ratio": 0.35706812979429786} {"step": 6200, "timestamp": 1778332417.2055511, "train/loss": 2.397010087966919, "train/z_loss": 0.0013532154844142497, "train/perplexity": 10.990267275119757, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026020.7836780185, "perf/iters_per_sec": 0.9660819929494946, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351088285446166, "data/tokens_consumed": 13004439552, "data/tokens_consumed_B": 13.004439552, "train/loss_slope": -6.270846510806206e-06} {"step": 6210, "timestamp": 1778332427.5607219, "train/loss": 2.332252526283264, "train/z_loss": 0.0013706346042454242, "train/perplexity": 10.301118960566871, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026800.5412044008, "perf/iters_per_sec": 0.9664538103124622, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347105979919433, "data/tokens_consumed": 13025411072, "data/tokens_consumed_B": 13.025411072, "train/loss_slope": -6.264253425197602e-06} {"step": 6220, "timestamp": 1778332437.9089828, "train/loss": 2.2945557117462156, "train/z_loss": 0.00136263637105003, "train/perplexity": 9.92002768129701, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027605.857369579, "perf/iters_per_sec": 0.9668378149841208, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034299635887146, "data/tokens_consumed": 13046382592, "data/tokens_consumed_B": 13.046382592, "train/loss_slope": -1.259694794724381e-05} {"step": 6225, "timestamp": 1778332443.6627336, "eos/sharpness": 66.22860431671141, "eos/L0_probe": 2.3465065956115723, "eos/L_plus": 2.749093532562256, "eos/L_minus": 2.606205701828003, "eos/grad_norm": 0.21382547914981842, "eos/embed_grad_frac": 0.049254726618528366, "eos/time_s": 0.5927374362945557} {"step": 6225, "timestamp": 1778332445.043963, "geo/rankme_last": 428.5435791015625, "geo/layer_0/stable_rank_q_proj": 20.736684799194336, "geo/layer_0/stable_rank_k_proj": 17.19087791442871, "geo/layer_0/stable_rank_o_proj": 45.00714111328125, "geo/layer_0/stable_rank_gate_proj": 128.7458038330078, "geo/layer_0/stable_rank_down_proj": 56.47544479370117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06782516092061996, "geo/layer_0/attn_entropy_mean": 6.251072406768799, "geo/layer_0/attn_entropy_std": 0.4385982155799866, "geo/layer_7/stable_rank_q_proj": 42.492794036865234, "geo/layer_7/stable_rank_k_proj": 39.07998275756836, "geo/layer_7/stable_rank_o_proj": 89.93476867675781, "geo/layer_7/stable_rank_gate_proj": 79.64999389648438, "geo/layer_7/stable_rank_down_proj": 144.02243041992188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4068439304828644, "geo/layer_7/attn_entropy_mean": 4.712871551513672, "geo/layer_7/attn_entropy_std": 0.781704843044281, "geo/layer_14/stable_rank_q_proj": 51.773529052734375, "geo/layer_14/stable_rank_k_proj": 42.07445526123047, "geo/layer_14/stable_rank_o_proj": 42.55125427246094, "geo/layer_14/stable_rank_gate_proj": 71.94679260253906, "geo/layer_14/stable_rank_down_proj": 127.00151824951172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3717380166053772, "geo/layer_14/attn_entropy_mean": 5.487858772277832, "geo/layer_14/attn_entropy_std": 0.4442140758037567, "geo/layer_21/stable_rank_q_proj": 39.32359313964844, "geo/layer_21/stable_rank_k_proj": 29.00938606262207, "geo/layer_21/stable_rank_o_proj": 66.07398223876953, "geo/layer_21/stable_rank_gate_proj": 62.01666259765625, "geo/layer_21/stable_rank_down_proj": 49.58998489379883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13684619963169098, "geo/layer_21/attn_entropy_mean": 5.84539794921875, "geo/layer_21/attn_entropy_std": 0.31932809948921204, "geo/layer_27/stable_rank_q_proj": 43.449066162109375, "geo/layer_27/stable_rank_k_proj": 30.523473739624023, "geo/layer_27/stable_rank_o_proj": 108.93313598632812, "geo/layer_27/stable_rank_gate_proj": 72.56342315673828, "geo/layer_27/stable_rank_down_proj": 128.2117462158203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09734834730625153, "geo/layer_27/attn_entropy_mean": 4.320303440093994, "geo/layer_27/attn_entropy_std": 0.6650336980819702, "attnres/final_alpha/block_0": 0.25435763597488403, "attnres/block_norm/0": 1.7775306701660156, "attnres/final_alpha/block_1": 0.00403162557631731, "attnres/block_norm/1": 49971.79296875, "attnres/final_alpha/block_2": 0.008614795282483101, "attnres/block_norm/2": 29758.42578125, "attnres/final_alpha/block_3": 0.010454272851347923, "attnres/block_norm/3": 69849.9140625, "attnres/final_alpha/block_4": 0.012346846982836723, "attnres/block_norm/4": 16908.197265625, "attnres/final_alpha/block_5": 0.607083261013031, "attnres/block_norm/5": 7127.166015625, "attnres/final_alpha/block_6": 0.10311155021190643, "attnres/block_norm/6": 46055.4375, "geo/tier1_time_s": 1.3615877628326416, "geo/step": 6225.0, "geo/rankme_slope": 0.0009701839329481792} {"step": 6230, "timestamp": 1778332450.2282815, "train/loss": 2.374575662612915, "train/z_loss": 0.0013475030311383307, "train/perplexity": 10.746452097021203, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702967.5429373286, "perf/iters_per_sec": 0.812038203686394, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2314691543579102, "data/tokens_consumed": 13067354112, "data/tokens_consumed_B": 13.067354112, "train/loss_slope": -1.0987282586176466e-05} {"step": 6240, "timestamp": 1778332460.5814323, "train/loss": 2.3844887018203735, "train/z_loss": 0.0013497128034941851, "train/perplexity": 10.853511865125334, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026961.114191873, "perf/iters_per_sec": 0.9665303774794927, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346286296844482, "data/tokens_consumed": 13088325632, "data/tokens_consumed_B": 13.088325632, "train/loss_slope": -9.946659825208041e-06} {"step": 6250, "timestamp": 1778332470.9279976, "grad/layer_0/attn": 0.003561141900718212, "grad/layer_0/mlp": 0.003507469780743122, "grad/layer_0/attn_mlp_ratio": 1.0153022040958248, "grad/layer_4/attn": 0.0026093758642673492, "grad/layer_4/mlp": 0.002526035998016596, "grad/layer_4/attn_mlp_ratio": 1.0329923100925558, "grad/layer_8/attn": 0.005339137744158506, "grad/layer_8/mlp": 0.0035663798917084932, "grad/layer_8/attn_mlp_ratio": 1.4970748368293656, "grad/layer_12/attn": 0.005026800557971001, "grad/layer_12/mlp": 0.00693588238209486, "grad/layer_12/attn_mlp_ratio": 0.7247528444935164, "grad/layer_16/attn": 0.004238545428961515, "grad/layer_16/mlp": 0.004712178371846676, "grad/layer_16/attn_mlp_ratio": 0.8994874566583329, "grad/layer_20/attn": 0.0027365211863070726, "grad/layer_20/mlp": 0.005553625524044037, "grad/layer_20/attn_mlp_ratio": 0.4927449870692569, "grad/layer_24/attn": 0.009166330099105835, "grad/layer_24/mlp": 0.00898547749966383, "grad/layer_24/attn_mlp_ratio": 1.0201271994099426, "grad/layer_27/attn": 0.011240617372095585, "grad/layer_27/mlp": 0.008475907146930695, "grad/layer_27/attn_mlp_ratio": 1.3261845658075189} {"step": 6250, "timestamp": 1778332470.9438095, "train/loss": 2.3967233896255493, "train/z_loss": 0.0013646612060256302, "train/perplexity": 10.987116835355279, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024613.7633675516, "perf/iters_per_sec": 0.9654110733831175, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035828185081482, "data/tokens_consumed": 13109297152, "data/tokens_consumed_B": 13.109297152, "train/loss_slope": -7.719700607565401e-06} {"step": 6260, "timestamp": 1778332481.3055775, "train/loss": 2.3714077472686768, "train/z_loss": 0.001357300637755543, "train/perplexity": 10.712462113647936, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024876.7665592239, "perf/iters_per_sec": 0.9655364830776328, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035693645477295, "data/tokens_consumed": 13130268672, "data/tokens_consumed_B": 13.130268672, "train/loss_slope": -7.859673243973939e-06} {"step": 6270, "timestamp": 1778332491.6594408, "train/loss": 2.3443307161331175, "train/z_loss": 0.001342343434225768, "train/perplexity": 10.426292242657112, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026828.2823632273, "perf/iters_per_sec": 0.9664670383278023, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346964359283448, "data/tokens_consumed": 13151240192, "data/tokens_consumed_B": 13.151240192, "train/loss_slope": -8.350734413117288e-06} {"step": 6280, "timestamp": 1778332502.0103116, "train/loss": 2.346534776687622, "train/z_loss": 0.0013471424230374397, "train/perplexity": 10.449297765588526, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026974.4730677947, "perf/iters_per_sec": 0.966536747487924, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034621810913086, "data/tokens_consumed": 13172211712, "data/tokens_consumed_B": 13.172211712, "train/loss_slope": -1.1249081765857955e-05} {"step": 6290, "timestamp": 1778332512.3576057, "train/loss": 2.3292275190353395, "train/z_loss": 0.0013504635775461794, "train/perplexity": 10.2700050846256, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027711.0717957963, "perf/iters_per_sec": 0.9668879851321203, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342459678649902, "data/tokens_consumed": 13193183232, "data/tokens_consumed_B": 13.193183232, "train/loss_slope": -1.4380188285857476e-05} {"step": 6300, "timestamp": 1778332523.1050804, "grad/layer_0/attn": 0.0029518448282033205, "grad/layer_0/mlp": 0.003301519900560379, "grad/layer_0/attn_mlp_ratio": 0.8940866109253596, "grad/layer_4/attn": 0.00248118513263762, "grad/layer_4/mlp": 0.0026313106063753366, "grad/layer_4/attn_mlp_ratio": 0.9429464664229955, "grad/layer_8/attn": 0.0036209060344845057, "grad/layer_8/mlp": 0.003511953866109252, "grad/layer_8/attn_mlp_ratio": 1.0310232051520758, "grad/layer_12/attn": 0.005885550752282143, "grad/layer_12/mlp": 0.006396881770342588, "grad/layer_12/attn_mlp_ratio": 0.9200655681276383, "grad/layer_16/attn": 0.0035109168384224176, "grad/layer_16/mlp": 0.004607838112860918, "grad/layer_16/attn_mlp_ratio": 0.7619444685846636, "grad/layer_20/attn": 0.002979049226269126, "grad/layer_20/mlp": 0.006129471119493246, "grad/layer_20/attn_mlp_ratio": 0.4860205912697668, "grad/layer_24/attn": 0.00976580660790205, "grad/layer_24/mlp": 0.010205872356891632, "grad/layer_24/attn_mlp_ratio": 0.9568811141969129, "grad/layer_27/attn": 0.007760294713079929, "grad/layer_27/mlp": 0.010357835330069065, "grad/layer_27/attn_mlp_ratio": 0.7492197347093962} {"step": 6300, "timestamp": 1778332523.7033033, "eos/sharpness": 60.40759086608885, "eos/L0_probe": 2.345104694366455, "eos/L_plus": 2.70710825920105, "eos/L_minus": 2.587177038192749, "eos/grad_norm": 0.18745562434196472, "eos/embed_grad_frac": 0.0675637349486351, "eos/time_s": 0.5949170589447021} {"step": 6300, "timestamp": 1778332523.7328777, "train/loss": 2.3819224357604982, "train/z_loss": 0.0013486874988302589, "train/perplexity": 10.825694574546164, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1844822.3074258997, "perf/iters_per_sec": 0.8796798264626978, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1367772340774536, "data/tokens_consumed": 13214154752, "data/tokens_consumed_B": 13.214154752, "train/loss_slope": -1.1311772504154775e-05} {"step": 6300, "timestamp": 1778332525.0943372, "geo/rankme_last": 427.3332824707031, "geo/layer_0/stable_rank_q_proj": 20.721590042114258, "geo/layer_0/stable_rank_k_proj": 17.18464469909668, "geo/layer_0/stable_rank_o_proj": 45.02943801879883, "geo/layer_0/stable_rank_gate_proj": 128.76058959960938, "geo/layer_0/stable_rank_down_proj": 56.504154205322266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06571508198976517, "geo/layer_0/attn_entropy_mean": 6.2530388832092285, "geo/layer_0/attn_entropy_std": 0.4371626675128937, "geo/layer_7/stable_rank_q_proj": 42.49625015258789, "geo/layer_7/stable_rank_k_proj": 39.09336853027344, "geo/layer_7/stable_rank_o_proj": 89.83056640625, "geo/layer_7/stable_rank_gate_proj": 79.43285369873047, "geo/layer_7/stable_rank_down_proj": 144.0019989013672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39380407333374023, "geo/layer_7/attn_entropy_mean": 4.717722415924072, "geo/layer_7/attn_entropy_std": 0.7620804905891418, "geo/layer_14/stable_rank_q_proj": 51.73698425292969, "geo/layer_14/stable_rank_k_proj": 42.106204986572266, "geo/layer_14/stable_rank_o_proj": 42.50472640991211, "geo/layer_14/stable_rank_gate_proj": 71.91260528564453, "geo/layer_14/stable_rank_down_proj": 127.1492919921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38862887024879456, "geo/layer_14/attn_entropy_mean": 5.508931636810303, "geo/layer_14/attn_entropy_std": 0.44751107692718506, "geo/layer_21/stable_rank_q_proj": 39.38146209716797, "geo/layer_21/stable_rank_k_proj": 28.97840690612793, "geo/layer_21/stable_rank_o_proj": 66.1061019897461, "geo/layer_21/stable_rank_gate_proj": 62.07424545288086, "geo/layer_21/stable_rank_down_proj": 49.577362060546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.135009303689003, "geo/layer_21/attn_entropy_mean": 5.854750633239746, "geo/layer_21/attn_entropy_std": 0.3139342665672302, "geo/layer_27/stable_rank_q_proj": 43.47832107543945, "geo/layer_27/stable_rank_k_proj": 30.63351821899414, "geo/layer_27/stable_rank_o_proj": 108.91748809814453, "geo/layer_27/stable_rank_gate_proj": 72.48120880126953, "geo/layer_27/stable_rank_down_proj": 128.50563049316406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10340524464845657, "geo/layer_27/attn_entropy_mean": 4.326806545257568, "geo/layer_27/attn_entropy_std": 0.6429338455200195, "attnres/final_alpha/block_0": 0.2517486810684204, "attnres/block_norm/0": 1.7775462865829468, "attnres/final_alpha/block_1": 0.003967122174799442, "attnres/block_norm/1": 49766.12890625, "attnres/final_alpha/block_2": 0.008506739512085915, "attnres/block_norm/2": 29749.62890625, "attnres/final_alpha/block_3": 0.010430043563246727, "attnres/block_norm/3": 69752.5, "attnres/final_alpha/block_4": 0.01186903566122055, "attnres/block_norm/4": 16877.1953125, "attnres/final_alpha/block_5": 0.6131294369697571, "attnres/block_norm/5": 7065.9990234375, "attnres/final_alpha/block_6": 0.10034893453121185, "attnres/block_norm/6": 46670.80078125, "geo/tier1_time_s": 1.3571782112121582, "geo/step": 6300.0, "geo/rankme_slope": 0.0009231201269570328} {"step": 6310, "timestamp": 1778332535.4622862, "train/loss": 2.330524730682373, "train/z_loss": 0.00136871172580868, "train/perplexity": 10.283336099540971, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788572.0770266976, "perf/iters_per_sec": 0.8528576264508713, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1725286483764648, "data/tokens_consumed": 13235126272, "data/tokens_consumed_B": 13.235126272, "train/loss_slope": -1.272930074589345e-05} {"step": 6320, "timestamp": 1778332545.8221662, "train/loss": 2.381393051147461, "train/z_loss": 0.001355265046004206, "train/perplexity": 10.81996513508531, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025394.909770371, "perf/iters_per_sec": 0.9657835530139784, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354286909103394, "data/tokens_consumed": 13256097792, "data/tokens_consumed_B": 13.256097792, "train/loss_slope": -1.468592698674933e-05} {"step": 6330, "timestamp": 1778332556.171821, "train/loss": 2.338763403892517, "train/z_loss": 0.001354448834899813, "train/perplexity": 10.36840710007417, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027404.76039745, "perf/iters_per_sec": 0.9667419244754076, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344022274017335, "data/tokens_consumed": 13277069312, "data/tokens_consumed_B": 13.277069312, "train/loss_slope": -1.6133237707697214e-05} {"step": 6340, "timestamp": 1778332566.5277739, "train/loss": 2.3250186920166014, "train/z_loss": 0.001357595541048795, "train/perplexity": 10.226871244851509, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026025.3569289853, "perf/iters_per_sec": 0.9660841736454894, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351064920425415, "data/tokens_consumed": 13298040832, "data/tokens_consumed_B": 13.298040832, "train/loss_slope": -1.7008380263265955e-05} {"step": 6350, "timestamp": 1778332576.876627, "grad/layer_0/attn": 0.002792011247947812, "grad/layer_0/mlp": 0.0031953221186995506, "grad/layer_0/attn_mlp_ratio": 0.8737808135932281, "grad/layer_4/attn": 0.0035444246605038643, "grad/layer_4/mlp": 0.0024849209003150463, "grad/layer_4/attn_mlp_ratio": 1.4263731764730108, "grad/layer_8/attn": 0.003130355617031455, "grad/layer_8/mlp": 0.003338729264214635, "grad/layer_8/attn_mlp_ratio": 0.9375889074997861, "grad/layer_12/attn": 0.013776065781712532, "grad/layer_12/mlp": 0.006486658938229084, "grad/layer_12/attn_mlp_ratio": 2.123753645832682, "grad/layer_16/attn": 0.004828091710805893, "grad/layer_16/mlp": 0.004682083614170551, "grad/layer_16/attn_mlp_ratio": 1.031184405394855, "grad/layer_20/attn": 0.0033274879679083824, "grad/layer_20/mlp": 0.006723608355969191, "grad/layer_20/attn_mlp_ratio": 0.49489615430450196, "grad/layer_24/attn": 0.013505754061043262, "grad/layer_24/mlp": 0.014069156721234322, "grad/layer_24/attn_mlp_ratio": 0.959954760093318, "grad/layer_27/attn": 0.010711852461099625, "grad/layer_27/mlp": 0.01338985189795494, "grad/layer_27/attn_mlp_ratio": 0.7999978239293211} {"step": 6350, "timestamp": 1778332576.8925755, "train/loss": 2.351337742805481, "train/z_loss": 0.0013643390615470708, "train/perplexity": 10.499606106629411, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025266.852957626, "perf/iters_per_sec": 0.9657224907673007, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354941606521606, "data/tokens_consumed": 13319012352, "data/tokens_consumed_B": 13.319012352, "train/loss_slope": -1.8590389398208072e-05} {"step": 6360, "timestamp": 1778332587.243908, "train/loss": 2.3803240776062013, "train/z_loss": 0.0013516367063857615, "train/perplexity": 10.808405058447395, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027329.7623285227, "perf/iters_per_sec": 0.9667061626093496, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344404935836793, "data/tokens_consumed": 13339983872, "data/tokens_consumed_B": 13.339983872, "train/loss_slope": -1.734542760840419e-05} {"step": 6370, "timestamp": 1778332597.5961661, "train/loss": 2.3331341981887816, "train/z_loss": 0.001358226442243904, "train/perplexity": 10.310205172690182, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026771.2596939115, "perf/iters_per_sec": 0.9664398478002126, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034725546836853, "data/tokens_consumed": 13360955392, "data/tokens_consumed_B": 13.360955392, "train/loss_slope": -1.7227172880175636e-05} {"step": 6375, "timestamp": 1778332603.3757477, "eos/sharpness": 59.19048786163329, "eos/L0_probe": 2.3416712284088135, "eos/L_plus": 2.7135956287384033, "eos/L_minus": 2.5616517066955566, "eos/grad_norm": 0.14059561491012573, "eos/embed_grad_frac": 0.11783277988433838, "eos/time_s": 0.6050646305084229} {"step": 6375, "timestamp": 1778332604.7596743, "geo/rankme_last": 427.7783508300781, "geo/layer_0/stable_rank_q_proj": 20.709171295166016, "geo/layer_0/stable_rank_k_proj": 17.214052200317383, "geo/layer_0/stable_rank_o_proj": 45.006591796875, "geo/layer_0/stable_rank_gate_proj": 128.78582763671875, "geo/layer_0/stable_rank_down_proj": 56.471824645996094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07199618965387344, "geo/layer_0/attn_entropy_mean": 6.25042724609375, "geo/layer_0/attn_entropy_std": 0.4389822781085968, "geo/layer_7/stable_rank_q_proj": 42.44727325439453, "geo/layer_7/stable_rank_k_proj": 39.034420013427734, "geo/layer_7/stable_rank_o_proj": 89.85688018798828, "geo/layer_7/stable_rank_gate_proj": 79.42423248291016, "geo/layer_7/stable_rank_down_proj": 143.9981231689453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40394309163093567, "geo/layer_7/attn_entropy_mean": 4.727932929992676, "geo/layer_7/attn_entropy_std": 0.7745610475540161, "geo/layer_14/stable_rank_q_proj": 51.801483154296875, "geo/layer_14/stable_rank_k_proj": 42.21363830566406, "geo/layer_14/stable_rank_o_proj": 42.55923080444336, "geo/layer_14/stable_rank_gate_proj": 71.94815826416016, "geo/layer_14/stable_rank_down_proj": 127.14266204833984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.368309885263443, "geo/layer_14/attn_entropy_mean": 5.510129451751709, "geo/layer_14/attn_entropy_std": 0.4145539402961731, "geo/layer_21/stable_rank_q_proj": 39.40087890625, "geo/layer_21/stable_rank_k_proj": 28.990537643432617, "geo/layer_21/stable_rank_o_proj": 66.13099670410156, "geo/layer_21/stable_rank_gate_proj": 62.07618713378906, "geo/layer_21/stable_rank_down_proj": 49.56676483154297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13574674725532532, "geo/layer_21/attn_entropy_mean": 5.873932838439941, "geo/layer_21/attn_entropy_std": 0.31715652346611023, "geo/layer_27/stable_rank_q_proj": 43.58114242553711, "geo/layer_27/stable_rank_k_proj": 30.621337890625, "geo/layer_27/stable_rank_o_proj": 108.7239761352539, "geo/layer_27/stable_rank_gate_proj": 72.54369354248047, "geo/layer_27/stable_rank_down_proj": 128.44110107421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11167797446250916, "geo/layer_27/attn_entropy_mean": 4.334512710571289, "geo/layer_27/attn_entropy_std": 0.6555953621864319, "attnres/final_alpha/block_0": 0.25204044580459595, "attnres/block_norm/0": 1.7774924039840698, "attnres/final_alpha/block_1": 0.003917808644473553, "attnres/block_norm/1": 49983.9765625, "attnres/final_alpha/block_2": 0.008448239415884018, "attnres/block_norm/2": 29747.48828125, "attnres/final_alpha/block_3": 0.010475574061274529, "attnres/block_norm/3": 70089.6328125, "attnres/final_alpha/block_4": 0.011984608136117458, "attnres/block_norm/4": 16915.03125, "attnres/final_alpha/block_5": 0.6126822829246521, "attnres/block_norm/5": 6999.486328125, "attnres/final_alpha/block_6": 0.10045108199119568, "attnres/block_norm/6": 46584.2734375, "geo/tier1_time_s": 1.362058162689209, "geo/step": 6375.0, "geo/rankme_slope": 0.0008987658735369147} {"step": 6380, "timestamp": 1778332609.9428468, "train/loss": 2.375211548805237, "train/z_loss": 0.0013478895300067962, "train/perplexity": 10.753287790657414, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699509.4335992592, "perf/iters_per_sec": 0.8103892486568733, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2339749097824098, "data/tokens_consumed": 13381926912, "data/tokens_consumed_B": 13.381926912, "train/loss_slope": -1.639663530047009e-05} {"step": 6390, "timestamp": 1778332620.2870162, "train/loss": 2.368250513076782, "train/z_loss": 0.0013593552517704665, "train/perplexity": 10.678693697433143, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028344.1312325697, "perf/iters_per_sec": 0.967189851394925, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339231729507445, "data/tokens_consumed": 13402898432, "data/tokens_consumed_B": 13.402898432, "train/loss_slope": -1.510120514023605e-05} {"step": 6400, "timestamp": 1778332630.6264284, "grad/layer_0/attn": 0.0046075256541371346, "grad/layer_0/mlp": 0.004170933738350868, "grad/layer_0/attn_mlp_ratio": 1.1046748360695382, "grad/layer_4/attn": 0.0024725007824599743, "grad/layer_4/mlp": 0.0027544330805540085, "grad/layer_4/attn_mlp_ratio": 0.8976441323447432, "grad/layer_8/attn": 0.011414716951549053, "grad/layer_8/mlp": 0.0036388214211910963, "grad/layer_8/attn_mlp_ratio": 3.136926855322289, "grad/layer_12/attn": 0.006290790159255266, "grad/layer_12/mlp": 0.0076945009641349316, "grad/layer_12/attn_mlp_ratio": 0.8175696002665405, "grad/layer_16/attn": 0.004882850684225559, "grad/layer_16/mlp": 0.006453626323491335, "grad/layer_16/attn_mlp_ratio": 0.7566057227068306, "grad/layer_20/attn": 0.0038017253391444683, "grad/layer_20/mlp": 0.008778305724263191, "grad/layer_20/attn_mlp_ratio": 0.4330818970371845, "grad/layer_24/attn": 0.018379461020231247, "grad/layer_24/mlp": 0.014977119863033295, "grad/layer_24/attn_mlp_ratio": 1.2271692465304178, "grad/layer_27/attn": 0.021556317806243896, "grad/layer_27/mlp": 0.014087814837694168, "grad/layer_27/attn_mlp_ratio": 1.5301391948702119} {"step": 6400, "timestamp": 1778332630.6423204, "train/loss": 2.368081259727478, "train/z_loss": 0.001356311806011945, "train/perplexity": 10.676886445704676, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026148.9358530012, "perf/iters_per_sec": 0.9661431006684309, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350433588027954, "data/tokens_consumed": 13423869952, "data/tokens_consumed_B": 13.423869952, "train/loss_slope": -1.6660141894812408e-05} {"step": 6410, "timestamp": 1778332640.9925287, "train/loss": 2.3740663528442383, "train/z_loss": 0.0013673339039087296, "train/perplexity": 10.740980217548692, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027466.071352129, "perf/iters_per_sec": 0.9667711598168035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343709468841553, "data/tokens_consumed": 13444841472, "data/tokens_consumed_B": 13.444841472, "train/loss_slope": -1.4983324714154826e-05} {"step": 6420, "timestamp": 1778332651.34871, "train/loss": 2.3615226984024047, "train/z_loss": 0.0013494591345079242, "train/perplexity": 10.607090561669914, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025999.690991788, "perf/iters_per_sec": 0.9660719351729335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035119605064392, "data/tokens_consumed": 13465812992, "data/tokens_consumed_B": 13.465812992, "train/loss_slope": -1.2155958799996883e-05} {"step": 6430, "timestamp": 1778332661.700834, "train/loss": 2.323184299468994, "train/z_loss": 0.0013556489837355911, "train/perplexity": 10.20812834462867, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026611.1837129644, "perf/iters_per_sec": 0.9663635176243612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034807276725769, "data/tokens_consumed": 13486784512, "data/tokens_consumed_B": 13.486784512, "train/loss_slope": -1.5940445763478995e-05} {"step": 6440, "timestamp": 1778332672.530033, "train/loss": 2.360064387321472, "train/z_loss": 0.0013630526955239475, "train/perplexity": 10.591633397383713, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1937518.1290081348, "perf/iters_per_sec": 0.9238806386032747, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0823909044265747, "data/tokens_consumed": 13507756032, "data/tokens_consumed_B": 13.507756032, "train/loss_slope": -1.4515968103005427e-05} {"step": 6450, "timestamp": 1778332682.872577, "grad/layer_0/attn": 0.003722599009051919, "grad/layer_0/mlp": 0.003677729517221451, "grad/layer_0/attn_mlp_ratio": 1.0122002965145565, "grad/layer_4/attn": 0.0018883600132539868, "grad/layer_4/mlp": 0.0026588754262775183, "grad/layer_4/attn_mlp_ratio": 0.7102100096794417, "grad/layer_8/attn": 0.00318330735899508, "grad/layer_8/mlp": 0.0034443459007889032, "grad/layer_8/attn_mlp_ratio": 0.9242124218257881, "grad/layer_12/attn": 0.00586369913071394, "grad/layer_12/mlp": 0.007331766653805971, "grad/layer_12/attn_mlp_ratio": 0.7997661856427788, "grad/layer_16/attn": 0.006954030599445105, "grad/layer_16/mlp": 0.004780934192240238, "grad/layer_16/attn_mlp_ratio": 1.4545338158551857, "grad/layer_20/attn": 0.0028853281401097775, "grad/layer_20/mlp": 0.006437705829739571, "grad/layer_20/attn_mlp_ratio": 0.4481919757752122, "grad/layer_24/attn": 0.011493959464132786, "grad/layer_24/mlp": 0.011140480637550354, "grad/layer_24/attn_mlp_ratio": 1.031729216620876, "grad/layer_27/attn": 0.011109261773526669, "grad/layer_27/mlp": 0.012856252491474152, "grad/layer_27/attn_mlp_ratio": 0.8641135272104074} {"step": 6450, "timestamp": 1778332683.4859438, "eos/sharpness": 63.468861579895005, "eos/L0_probe": 2.3397557735443115, "eos/L_plus": 2.6280999183654785, "eos/L_minus": 2.6861002445220947, "eos/grad_norm": 0.21504609286785126, "eos/embed_grad_frac": 0.05767776817083359, "eos/time_s": 0.6103966236114502} {"step": 6450, "timestamp": 1778332683.506072, "train/loss": 2.342880892753601, "train/z_loss": 0.0013541484950110316, "train/perplexity": 10.41118691307796, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911605.7762499969, "perf/iters_per_sec": 0.9115246659517273, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0970630168914794, "data/tokens_consumed": 13528727552, "data/tokens_consumed_B": 13.528727552, "train/loss_slope": -1.4532644212907692e-05} {"step": 6450, "timestamp": 1778332684.8716059, "geo/rankme_last": 427.1493225097656, "geo/layer_0/stable_rank_q_proj": 20.734180450439453, "geo/layer_0/stable_rank_k_proj": 17.173547744750977, "geo/layer_0/stable_rank_o_proj": 44.98862838745117, "geo/layer_0/stable_rank_gate_proj": 128.41513061523438, "geo/layer_0/stable_rank_down_proj": 56.585235595703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0625067725777626, "geo/layer_0/attn_entropy_mean": 6.2512922286987305, "geo/layer_0/attn_entropy_std": 0.4393298327922821, "geo/layer_7/stable_rank_q_proj": 42.45969772338867, "geo/layer_7/stable_rank_k_proj": 38.99168014526367, "geo/layer_7/stable_rank_o_proj": 89.8446273803711, "geo/layer_7/stable_rank_gate_proj": 79.32815551757812, "geo/layer_7/stable_rank_down_proj": 144.13783264160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4222741723060608, "geo/layer_7/attn_entropy_mean": 4.741277694702148, "geo/layer_7/attn_entropy_std": 0.7498086094856262, "geo/layer_14/stable_rank_q_proj": 51.83829879760742, "geo/layer_14/stable_rank_k_proj": 42.290218353271484, "geo/layer_14/stable_rank_o_proj": 42.53976058959961, "geo/layer_14/stable_rank_gate_proj": 71.95528411865234, "geo/layer_14/stable_rank_down_proj": 127.23371124267578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3731708824634552, "geo/layer_14/attn_entropy_mean": 5.540910720825195, "geo/layer_14/attn_entropy_std": 0.44652193784713745, "geo/layer_21/stable_rank_q_proj": 39.343955993652344, "geo/layer_21/stable_rank_k_proj": 29.02548599243164, "geo/layer_21/stable_rank_o_proj": 66.12063598632812, "geo/layer_21/stable_rank_gate_proj": 61.99408721923828, "geo/layer_21/stable_rank_down_proj": 49.503482818603516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13615375757217407, "geo/layer_21/attn_entropy_mean": 5.862339496612549, "geo/layer_21/attn_entropy_std": 0.31478357315063477, "geo/layer_27/stable_rank_q_proj": 43.68943405151367, "geo/layer_27/stable_rank_k_proj": 30.65705680847168, "geo/layer_27/stable_rank_o_proj": 108.53184509277344, "geo/layer_27/stable_rank_gate_proj": 72.54598236083984, "geo/layer_27/stable_rank_down_proj": 128.211669921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09327121824026108, "geo/layer_27/attn_entropy_mean": 4.333051681518555, "geo/layer_27/attn_entropy_std": 0.6506881713867188, "attnres/final_alpha/block_0": 0.2559855580329895, "attnres/block_norm/0": 1.777724266052246, "attnres/final_alpha/block_1": 0.004048437811434269, "attnres/block_norm/1": 49803.4375, "attnres/final_alpha/block_2": 0.008690789341926575, "attnres/block_norm/2": 29804.14453125, "attnres/final_alpha/block_3": 0.01071847416460514, "attnres/block_norm/3": 69811.96875, "attnres/final_alpha/block_4": 0.012153059244155884, "attnres/block_norm/4": 16948.9296875, "attnres/final_alpha/block_5": 0.6055358648300171, "attnres/block_norm/5": 7081.61279296875, "attnres/final_alpha/block_6": 0.10286780446767807, "attnres/block_norm/6": 46581.50390625, "geo/tier1_time_s": 1.3616950511932373, "geo/step": 6450.0, "geo/rankme_slope": 0.0008539335460746799} {"step": 6460, "timestamp": 1778332695.2298465, "train/loss": 2.380238080024719, "train/z_loss": 0.0013413239386864007, "train/perplexity": 10.807475601718776, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789364.7480987182, "perf/iters_per_sec": 0.8532356014722434, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1720092296600342, "data/tokens_consumed": 13549699072, "data/tokens_consumed_B": 13.549699072, "train/loss_slope": -1.22642806797389e-05} {"step": 6470, "timestamp": 1778332705.584647, "train/loss": 2.3475053310394287, "train/z_loss": 0.0013580555794760584, "train/perplexity": 10.45944430009337, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026558.842298803, "perf/iters_per_sec": 0.9663385592931761, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348340034484864, "data/tokens_consumed": 13570670592, "data/tokens_consumed_B": 13.570670592, "train/loss_slope": -1.3959301565990415e-05} {"step": 6480, "timestamp": 1778332715.9306486, "train/loss": 2.3956870317459105, "train/z_loss": 0.0013434990192763506, "train/perplexity": 10.975736148499301, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027936.1205512737, "perf/iters_per_sec": 0.9669952967411393, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341311931610107, "data/tokens_consumed": 13591642112, "data/tokens_consumed_B": 13.591642112, "train/loss_slope": -1.1077121153201216e-05} {"step": 6490, "timestamp": 1778332726.2774217, "train/loss": 2.421548366546631, "train/z_loss": 0.0013372771674767137, "train/perplexity": 11.263285514824117, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027763.753222839, "perf/iters_per_sec": 0.9669131055940814, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342190980911254, "data/tokens_consumed": 13612613632, "data/tokens_consumed_B": 13.612613632, "train/loss_slope": -6.141432276581918e-06} {"step": 6500, "timestamp": 1778332737.1501696, "grad/layer_0/attn": 0.00270223431289196, "grad/layer_0/mlp": 0.002987986197695136, "grad/layer_0/attn_mlp_ratio": 0.9043663670668107, "grad/layer_4/attn": 0.0024917200207710266, "grad/layer_4/mlp": 0.002830826910212636, "grad/layer_4/attn_mlp_ratio": 0.8802092151098496, "grad/layer_8/attn": 0.003869650885462761, "grad/layer_8/mlp": 0.0035484451800584793, "grad/layer_8/attn_mlp_ratio": 1.0905200954371173, "grad/layer_12/attn": 0.007258928846567869, "grad/layer_12/mlp": 0.00757122877985239, "grad/layer_12/attn_mlp_ratio": 0.9587517378961328, "grad/layer_16/attn": 0.0037301573902368546, "grad/layer_16/mlp": 0.004460653755813837, "grad/layer_16/attn_mlp_ratio": 0.8362355633973072, "grad/layer_20/attn": 0.004331375006586313, "grad/layer_20/mlp": 0.006149042397737503, "grad/layer_20/attn_mlp_ratio": 0.7043982877301649, "grad/layer_24/attn": 0.0062379054725170135, "grad/layer_24/mlp": 0.008679887279868126, "grad/layer_24/attn_mlp_ratio": 0.7186620286093834, "grad/layer_27/attn": 0.009754274040460587, "grad/layer_27/mlp": 0.008441724814474583, "grad/layer_27/attn_mlp_ratio": 1.1554835225364244} {"step": 6500, "timestamp": 1778332737.166275, "train/loss": 2.327285575866699, "train/z_loss": 0.0013476012973114848, "train/perplexity": 10.250080670710839, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926915.265215306, "perf/iters_per_sec": 0.9188247991634875, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0883467674255372, "data/tokens_consumed": 13633585152, "data/tokens_consumed_B": 13.633585152, "train/loss_slope": -8.264665563579445e-06} {"step": 6500, "timestamp": 1778332744.060186, "geo/ww_alpha_mean": 7.830775262482453, "geo/ww_alpha_std": 4.946439029593625, "geo/ww_alpha_min": 1.3580719766427716, "geo/ww_alpha_max": 34.32910360542519, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9295941924703115, "geo/ww_alpha_by_type/k_proj": 4.358672316079479, "geo/ww_alpha_by_type/v_proj": 10.072424424404714, "geo/ww_alpha_by_type/o_proj": 8.510329344871582, "geo/ww_alpha_by_type/gate_proj": 7.990298596377707, "geo/ww_alpha_by_type/up_proj": 12.071495732484612, "geo/ww_alpha_by_type/down_proj": 7.984902536537514, "geo/twonn_id/layer_0": 0.7287420630455017, "geo/twonn_id/layer_7": 3.7692694664001465, "geo/twonn_id/layer_14": 5.640675067901611, "geo/twonn_id/layer_21": 7.275345325469971, "geo/twonn_id/layer_27": 6.610953330993652, "geo/tier2_time_s": 6.887618541717529} {"step": 6500, "timestamp": 1778332744.8212302, "eoc/jacobian_sigma/layer_0/attn": 1374.8114013671875, "eoc/jacobian_sigma/layer_0/mlp": 11551.7255859375, "eoc/jacobian_sigma/layer_0": 11551.7255859375, "eoc/jacobian_sigma/layer_7/attn": 1.1412466764450073, "eoc/jacobian_sigma/layer_7/mlp": 1.8402745723724365, "eoc/jacobian_sigma/layer_7": 1.8402745723724365, "eoc/jacobian_sigma/layer_14/attn": 1.8162188529968262, "eoc/jacobian_sigma/layer_14/mlp": 12.417859077453613, "eoc/jacobian_sigma/layer_14": 12.417859077453613, "eoc/jacobian_sigma/layer_21/attn": 1.0960203409194946, "eoc/jacobian_sigma/layer_21/mlp": 5.150986671447754, "eoc/jacobian_sigma/layer_21": 5.150986671447754, "eoc/jacobian_sigma/layer_27/attn": 3.6686036586761475, "eoc/jacobian_sigma/layer_27/mlp": 33.89448165893555, "eoc/jacobian_sigma/layer_27": 33.89448165893555, "eoc/layer0_sigma": 11551.7255859375, "eoc/sigma_max": 33.89448165893555, "eoc/sigma_min": 1.8402745723724365, "eoc/sigma_mean": 13.325900495052338, "eoc/time_s": 0.7547948360443115} {"step": 6510, "timestamp": 1778332755.1915748, "train/loss": 2.330688238143921, "train/z_loss": 0.0013560778112150728, "train/perplexity": 10.285017639191246, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163776.5624240316, "perf/iters_per_sec": 0.5549319088096769, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.802022886276245, "data/tokens_consumed": 13654556672, "data/tokens_consumed_B": 13.654556672, "train/loss_slope": -7.311818008125355e-06} {"step": 6520, "timestamp": 1778332765.53906, "train/loss": 2.37803909778595, "train/z_loss": 0.001361057732719928, "train/perplexity": 10.783736265579936, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027819.7565346411, "perf/iters_per_sec": 0.9669398100541311, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341905355453491, "data/tokens_consumed": 13675528192, "data/tokens_consumed_B": 13.675528192, "train/loss_slope": -5.04815712226899e-06} {"step": 6525, "timestamp": 1778332771.3147073, "eos/sharpness": 61.2711191177368, "eos/L0_probe": 2.3428523540496826, "eos/L_plus": 2.620692491531372, "eos/L_minus": 2.6777234077453613, "eos/grad_norm": 0.19721125066280365, "eos/embed_grad_frac": 0.05891682580113411, "eos/time_s": 0.6099996566772461} {"step": 6525, "timestamp": 1778332772.6938465, "geo/rankme_last": 428.3768615722656, "geo/layer_0/stable_rank_q_proj": 20.755346298217773, "geo/layer_0/stable_rank_k_proj": 17.1779727935791, "geo/layer_0/stable_rank_o_proj": 45.09325408935547, "geo/layer_0/stable_rank_gate_proj": 128.40541076660156, "geo/layer_0/stable_rank_down_proj": 56.580963134765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06552909314632416, "geo/layer_0/attn_entropy_mean": 6.252450942993164, "geo/layer_0/attn_entropy_std": 0.43573805689811707, "geo/layer_7/stable_rank_q_proj": 42.32294464111328, "geo/layer_7/stable_rank_k_proj": 39.06122589111328, "geo/layer_7/stable_rank_o_proj": 89.724609375, "geo/layer_7/stable_rank_gate_proj": 79.2520523071289, "geo/layer_7/stable_rank_down_proj": 144.3197784423828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4012219309806824, "geo/layer_7/attn_entropy_mean": 4.731122970581055, "geo/layer_7/attn_entropy_std": 0.7591243982315063, "geo/layer_14/stable_rank_q_proj": 51.83020782470703, "geo/layer_14/stable_rank_k_proj": 42.1976432800293, "geo/layer_14/stable_rank_o_proj": 42.573997497558594, "geo/layer_14/stable_rank_gate_proj": 72.00092315673828, "geo/layer_14/stable_rank_down_proj": 127.31682586669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36169934272766113, "geo/layer_14/attn_entropy_mean": 5.5178422927856445, "geo/layer_14/attn_entropy_std": 0.43714335560798645, "geo/layer_21/stable_rank_q_proj": 39.28102111816406, "geo/layer_21/stable_rank_k_proj": 29.007728576660156, "geo/layer_21/stable_rank_o_proj": 66.05087280273438, "geo/layer_21/stable_rank_gate_proj": 61.966651916503906, "geo/layer_21/stable_rank_down_proj": 49.475921630859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13535794615745544, "geo/layer_21/attn_entropy_mean": 5.8585429191589355, "geo/layer_21/attn_entropy_std": 0.32247626781463623, "geo/layer_27/stable_rank_q_proj": 43.705467224121094, "geo/layer_27/stable_rank_k_proj": 30.655858993530273, "geo/layer_27/stable_rank_o_proj": 108.37635040283203, "geo/layer_27/stable_rank_gate_proj": 72.46354675292969, "geo/layer_27/stable_rank_down_proj": 128.03546142578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09417276829481125, "geo/layer_27/attn_entropy_mean": 4.3365631103515625, "geo/layer_27/attn_entropy_std": 0.6650758981704712, "attnres/final_alpha/block_0": 0.2544282078742981, "attnres/block_norm/0": 1.7776758670806885, "attnres/final_alpha/block_1": 0.0040331147611141205, "attnres/block_norm/1": 49992.28515625, "attnres/final_alpha/block_2": 0.008656775578856468, "attnres/block_norm/2": 29811.86328125, "attnres/final_alpha/block_3": 0.01060809288173914, "attnres/block_norm/3": 69953.75, "attnres/final_alpha/block_4": 0.012232968583703041, "attnres/block_norm/4": 16885.921875, "attnres/final_alpha/block_5": 0.6069530248641968, "attnres/block_norm/5": 7108.986328125, "attnres/final_alpha/block_6": 0.10308781266212463, "attnres/block_norm/6": 46431.578125, "geo/tier1_time_s": 1.3600261211395264, "geo/step": 6525.0, "geo/rankme_slope": 0.0008286384866446579} {"step": 6530, "timestamp": 1778332777.8707583, "train/loss": 2.3657527208328246, "train/z_loss": 0.001352797111030668, "train/perplexity": 10.652053823425872, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701188.4466946379, "perf/iters_per_sec": 0.8111898644898595, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.232757019996643, "data/tokens_consumed": 13696499712, "data/tokens_consumed_B": 13.696499712, "train/loss_slope": -3.2536756588180723e-06} {"step": 6540, "timestamp": 1778332788.2188776, "train/loss": 2.3292612791061402, "train/z_loss": 0.0013629134045913815, "train/perplexity": 10.270351806577027, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027860.3351573201, "perf/iters_per_sec": 0.9669591594492531, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341698408126831, "data/tokens_consumed": 13717471232, "data/tokens_consumed_B": 13.717471232, "train/loss_slope": -7.895423884582103e-06} {"step": 6550, "timestamp": 1778332798.5569065, "grad/layer_0/attn": 0.0023820900823920965, "grad/layer_0/mlp": 0.0029065581038594246, "grad/layer_0/attn_mlp_ratio": 0.8195569864140612, "grad/layer_4/attn": 0.002001268556341529, "grad/layer_4/mlp": 0.0025047974195331335, "grad/layer_4/attn_mlp_ratio": 0.7989741848333286, "grad/layer_8/attn": 0.0031183829996734858, "grad/layer_8/mlp": 0.0031332748476415873, "grad/layer_8/attn_mlp_ratio": 0.9952471620853731, "grad/layer_12/attn": 0.008372175507247448, "grad/layer_12/mlp": 0.006793648470193148, "grad/layer_12/attn_mlp_ratio": 1.2323533401447968, "grad/layer_16/attn": 0.003958264831453562, "grad/layer_16/mlp": 0.004481019452214241, "grad/layer_16/attn_mlp_ratio": 0.8833402276715427, "grad/layer_20/attn": 0.0042425948195159435, "grad/layer_20/mlp": 0.0061178142204880714, "grad/layer_20/attn_mlp_ratio": 0.6934821158772068, "grad/layer_24/attn": 0.009479908272624016, "grad/layer_24/mlp": 0.010215782560408115, "grad/layer_24/attn_mlp_ratio": 0.9279669103928743, "grad/layer_27/attn": 0.005745193921029568, "grad/layer_27/mlp": 0.009467982687056065, "grad/layer_27/attn_mlp_ratio": 0.6068023200130843} {"step": 6550, "timestamp": 1778332798.5726864, "train/loss": 2.338779330253601, "train/z_loss": 0.0013495201244950294, "train/perplexity": 10.368572232384489, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026543.901445577, "perf/iters_per_sec": 0.9663314349391827, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348416328430177, "data/tokens_consumed": 13738442752, "data/tokens_consumed_B": 13.738442752, "train/loss_slope": -7.836145285976396e-06} {"step": 6560, "timestamp": 1778332808.94736, "train/loss": 2.3528189182281496, "train/z_loss": 0.0013551959535107016, "train/perplexity": 10.515169388272051, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022419.6556436995, "perf/iters_per_sec": 0.9643648412912843, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369519472122193, "data/tokens_consumed": 13759414272, "data/tokens_consumed_B": 13.759414272, "train/loss_slope": -8.061377379116737e-06} {"step": 6570, "timestamp": 1778332819.2980735, "train/loss": 2.3355395555496217, "train/z_loss": 0.001361250039190054, "train/perplexity": 10.335034750626647, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027457.472641023, "perf/iters_per_sec": 0.9667670596318355, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343753337860107, "data/tokens_consumed": 13780385792, "data/tokens_consumed_B": 13.780385792, "train/loss_slope": -8.742413877045998e-06} {"step": 6580, "timestamp": 1778332829.6482375, "train/loss": 2.37675199508667, "train/z_loss": 0.001368883962277323, "train/perplexity": 10.76986541804175, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027483.7830575649, "perf/iters_per_sec": 0.9667796054160904, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343619108200073, "data/tokens_consumed": 13801357312, "data/tokens_consumed_B": 13.801357312, "train/loss_slope": -7.523500615327983e-06} {"step": 6590, "timestamp": 1778332840.0002282, "train/loss": 2.3567500829696657, "train/z_loss": 0.0013553556869737803, "train/perplexity": 10.556587609006854, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026730.8180419807, "perf/iters_per_sec": 0.9664205637178329, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347461938858031, "data/tokens_consumed": 13822328832, "data/tokens_consumed_B": 13.822328832, "train/loss_slope": -6.856656611019404e-06} {"step": 6600, "timestamp": 1778332850.3509612, "grad/layer_0/attn": 0.0034577054902911186, "grad/layer_0/mlp": 0.00330652785487473, "grad/layer_0/attn_mlp_ratio": 1.045720930680024, "grad/layer_4/attn": 0.0022791672963649035, "grad/layer_4/mlp": 0.002616699319332838, "grad/layer_4/attn_mlp_ratio": 0.871008446566632, "grad/layer_8/attn": 0.0038916293997317553, "grad/layer_8/mlp": 0.003394427942112088, "grad/layer_8/attn_mlp_ratio": 1.146475739491681, "grad/layer_12/attn": 0.006659338250756264, "grad/layer_12/mlp": 0.006993426010012627, "grad/layer_12/attn_mlp_ratio": 0.9522282992626401, "grad/layer_16/attn": 0.0034268293529748917, "grad/layer_16/mlp": 0.004764505662024021, "grad/layer_16/attn_mlp_ratio": 0.7192413073122473, "grad/layer_20/attn": 0.003700435161590576, "grad/layer_20/mlp": 0.007100821007043123, "grad/layer_20/attn_mlp_ratio": 0.5211277830841575, "grad/layer_24/attn": 0.013125557452440262, "grad/layer_24/mlp": 0.011525586247444153, "grad/layer_24/attn_mlp_ratio": 1.1388190636696682, "grad/layer_27/attn": 0.008544756099581718, "grad/layer_27/mlp": 0.011938202194869518, "grad/layer_27/attn_mlp_ratio": 0.7157489786593628} {"step": 6600, "timestamp": 1778332850.9492407, "eos/sharpness": 55.65576553344725, "eos/L0_probe": 2.338811159133911, "eos/L_plus": 2.603391170501709, "eos/L_minus": 2.630788803100586, "eos/grad_norm": 0.21648402512073517, "eos/embed_grad_frac": 0.05853335186839104, "eos/time_s": 0.5954587459564209} {"step": 6600, "timestamp": 1778332850.9706895, "train/loss": 2.381925368309021, "train/z_loss": 0.0013423225376755, "train/perplexity": 10.825726321467346, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913206.1797402527, "perf/iters_per_sec": 0.9122877978039993, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0961453199386597, "data/tokens_consumed": 13843300352, "data/tokens_consumed_B": 13.843300352, "train/loss_slope": -7.4269515345700015e-06} {"step": 6600, "timestamp": 1778332852.3327467, "geo/rankme_last": 427.8970642089844, "geo/layer_0/stable_rank_q_proj": 20.773101806640625, "geo/layer_0/stable_rank_k_proj": 17.154190063476562, "geo/layer_0/stable_rank_o_proj": 45.0372200012207, "geo/layer_0/stable_rank_gate_proj": 128.52183532714844, "geo/layer_0/stable_rank_down_proj": 56.574188232421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06523511558771133, "geo/layer_0/attn_entropy_mean": 6.248563766479492, "geo/layer_0/attn_entropy_std": 0.4301799237728119, "geo/layer_7/stable_rank_q_proj": 42.177162170410156, "geo/layer_7/stable_rank_k_proj": 39.08196258544922, "geo/layer_7/stable_rank_o_proj": 89.56280517578125, "geo/layer_7/stable_rank_gate_proj": 79.35066986083984, "geo/layer_7/stable_rank_down_proj": 144.2845458984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4174502193927765, "geo/layer_7/attn_entropy_mean": 4.747310638427734, "geo/layer_7/attn_entropy_std": 0.7551872730255127, "geo/layer_14/stable_rank_q_proj": 51.8839225769043, "geo/layer_14/stable_rank_k_proj": 42.168495178222656, "geo/layer_14/stable_rank_o_proj": 42.526607513427734, "geo/layer_14/stable_rank_gate_proj": 72.05726623535156, "geo/layer_14/stable_rank_down_proj": 127.10553741455078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3661930561065674, "geo/layer_14/attn_entropy_mean": 5.52830696105957, "geo/layer_14/attn_entropy_std": 0.4257204830646515, "geo/layer_21/stable_rank_q_proj": 39.308902740478516, "geo/layer_21/stable_rank_k_proj": 29.05084228515625, "geo/layer_21/stable_rank_o_proj": 65.9655532836914, "geo/layer_21/stable_rank_gate_proj": 61.9559326171875, "geo/layer_21/stable_rank_down_proj": 49.59236526489258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13784785568714142, "geo/layer_21/attn_entropy_mean": 5.8656086921691895, "geo/layer_21/attn_entropy_std": 0.31160688400268555, "geo/layer_27/stable_rank_q_proj": 43.77001190185547, "geo/layer_27/stable_rank_k_proj": 30.66262435913086, "geo/layer_27/stable_rank_o_proj": 108.27024841308594, "geo/layer_27/stable_rank_gate_proj": 72.3777084350586, "geo/layer_27/stable_rank_down_proj": 127.93914031982422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08933503180742264, "geo/layer_27/attn_entropy_mean": 4.347476005554199, "geo/layer_27/attn_entropy_std": 0.6451900601387024, "attnres/final_alpha/block_0": 0.25610995292663574, "attnres/block_norm/0": 1.777626395225525, "attnres/final_alpha/block_1": 0.004073023330420256, "attnres/block_norm/1": 50004.109375, "attnres/final_alpha/block_2": 0.008724518120288849, "attnres/block_norm/2": 29784.4296875, "attnres/final_alpha/block_3": 0.01079072430729866, "attnres/block_norm/3": 69735.96875, "attnres/final_alpha/block_4": 0.012236282229423523, "attnres/block_norm/4": 16978.134765625, "attnres/final_alpha/block_5": 0.604597806930542, "attnres/block_norm/5": 7161.9970703125, "attnres/final_alpha/block_6": 0.10346771776676178, "attnres/block_norm/6": 46349.61328125, "geo/tier1_time_s": 1.3578603267669678, "geo/step": 6600.0, "geo/rankme_slope": 0.0007967717946553622} {"step": 6610, "timestamp": 1778332862.708715, "train/loss": 2.405758500099182, "train/z_loss": 0.0013511569006368518, "train/perplexity": 11.086836460294688, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787255.2416726134, "perf/iters_per_sec": 0.8522297104228084, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1733925580978393, "data/tokens_consumed": 13864271872, "data/tokens_consumed_B": 13.864271872, "train/loss_slope": -5.400326829252789e-06} {"step": 6620, "timestamp": 1778332873.0591674, "train/loss": 2.3563798666000366, "train/z_loss": 0.0013437407556921243, "train/perplexity": 10.552680110821102, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027193.0978741338, "perf/iters_per_sec": 0.9666409959192914, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345102310180665, "data/tokens_consumed": 13885243392, "data/tokens_consumed_B": 13.885243392, "train/loss_slope": -4.535775981505954e-06} {"step": 6630, "timestamp": 1778332883.4259298, "train/loss": 2.340333890914917, "train/z_loss": 0.0013599745579995215, "train/perplexity": 10.384703342036596, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024113.765728338, "perf/iters_per_sec": 0.9651726559297266, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360840559005737, "data/tokens_consumed": 13906214912, "data/tokens_consumed_B": 13.906214912, "train/loss_slope": -5.478614820863194e-06} {"step": 6640, "timestamp": 1778332893.7790895, "train/loss": 2.3375365257263185, "train/z_loss": 0.0013721565133891999, "train/perplexity": 10.35569412801342, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026899.320004591, "perf/iters_per_sec": 0.9665009117148357, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346601724624633, "data/tokens_consumed": 13927186432, "data/tokens_consumed_B": 13.927186432, "train/loss_slope": -8.675578976049454e-06} {"step": 6650, "timestamp": 1778332904.1315975, "grad/layer_0/attn": 0.0029473225586116314, "grad/layer_0/mlp": 0.0033860679250210524, "grad/layer_0/attn_mlp_ratio": 0.8704262692989775, "grad/layer_4/attn": 0.0017851028824225068, "grad/layer_4/mlp": 0.0026770883705466986, "grad/layer_4/attn_mlp_ratio": 0.6668075792272805, "grad/layer_8/attn": 0.004344433080404997, "grad/layer_8/mlp": 0.0034517417661845684, "grad/layer_8/attn_mlp_ratio": 1.25862050200387, "grad/layer_12/attn": 0.004879309330135584, "grad/layer_12/mlp": 0.00656931521371007, "grad/layer_12/attn_mlp_ratio": 0.7427424468349896, "grad/layer_16/attn": 0.003329601138830185, "grad/layer_16/mlp": 0.004314883146435022, "grad/layer_16/attn_mlp_ratio": 0.7716549785167693, "grad/layer_20/attn": 0.0035813564900308847, "grad/layer_20/mlp": 0.005961093585938215, "grad/layer_20/attn_mlp_ratio": 0.6007884926350081, "grad/layer_24/attn": 0.007046719081699848, "grad/layer_24/mlp": 0.008067144080996513, "grad/layer_24/attn_mlp_ratio": 0.8735085085375759, "grad/layer_27/attn": 0.0059392801485955715, "grad/layer_27/mlp": 0.008494868874549866, "grad/layer_27/attn_mlp_ratio": 0.6991608895192275} {"step": 6650, "timestamp": 1778332904.1473405, "train/loss": 2.3411182880401613, "train/z_loss": 0.0013573934091255068, "train/perplexity": 10.392852269064248, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023769.892850156, "perf/iters_per_sec": 0.9650086845637111, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362601041793824, "data/tokens_consumed": 13948157952, "data/tokens_consumed_B": 13.948157952, "train/loss_slope": -1.118814845790456e-05} {"step": 6660, "timestamp": 1778332914.5017054, "train/loss": 2.354250764846802, "train/z_loss": 0.001356603263411671, "train/perplexity": 10.530236282171586, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026870.3625739166, "perf/iters_per_sec": 0.966487103735884, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346749544143676, "data/tokens_consumed": 13969129472, "data/tokens_consumed_B": 13.969129472, "train/loss_slope": -9.389922113129573e-06} {"step": 6670, "timestamp": 1778332924.8494089, "train/loss": 2.3574498176574705, "train/z_loss": 0.0013564507942646741, "train/perplexity": 10.563977004548413, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027744.7277599375, "perf/iters_per_sec": 0.966904033546418, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034228801727295, "data/tokens_consumed": 13990100992, "data/tokens_consumed_B": 13.990100992, "train/loss_slope": -9.884068317110028e-06} {"step": 6675, "timestamp": 1778332930.6115801, "eos/sharpness": 68.39001178741454, "eos/L0_probe": 2.342099905014038, "eos/L_plus": 2.6472883224487305, "eos/L_minus": 2.720811605453491, "eos/grad_norm": 0.2319735884666443, "eos/embed_grad_frac": 0.04567822813987732, "eos/time_s": 0.5994551181793213} {"step": 6675, "timestamp": 1778332931.993752, "geo/rankme_last": 428.34844970703125, "geo/layer_0/stable_rank_q_proj": 20.76606559753418, "geo/layer_0/stable_rank_k_proj": 17.1678466796875, "geo/layer_0/stable_rank_o_proj": 45.0107421875, "geo/layer_0/stable_rank_gate_proj": 128.33364868164062, "geo/layer_0/stable_rank_down_proj": 56.60846710205078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06414110958576202, "geo/layer_0/attn_entropy_mean": 6.247066974639893, "geo/layer_0/attn_entropy_std": 0.4333456754684448, "geo/layer_7/stable_rank_q_proj": 42.140220642089844, "geo/layer_7/stable_rank_k_proj": 39.04338073730469, "geo/layer_7/stable_rank_o_proj": 89.61399841308594, "geo/layer_7/stable_rank_gate_proj": 79.24993896484375, "geo/layer_7/stable_rank_down_proj": 144.2938995361328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4044811725616455, "geo/layer_7/attn_entropy_mean": 4.712920665740967, "geo/layer_7/attn_entropy_std": 0.7668743133544922, "geo/layer_14/stable_rank_q_proj": 51.87385940551758, "geo/layer_14/stable_rank_k_proj": 42.24596405029297, "geo/layer_14/stable_rank_o_proj": 42.51202392578125, "geo/layer_14/stable_rank_gate_proj": 72.03823852539062, "geo/layer_14/stable_rank_down_proj": 127.13321685791016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38441213965415955, "geo/layer_14/attn_entropy_mean": 5.5209808349609375, "geo/layer_14/attn_entropy_std": 0.4387156367301941, "geo/layer_21/stable_rank_q_proj": 39.385501861572266, "geo/layer_21/stable_rank_k_proj": 29.013946533203125, "geo/layer_21/stable_rank_o_proj": 66.0303726196289, "geo/layer_21/stable_rank_gate_proj": 61.87773895263672, "geo/layer_21/stable_rank_down_proj": 49.639591217041016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1318264752626419, "geo/layer_21/attn_entropy_mean": 5.858343124389648, "geo/layer_21/attn_entropy_std": 0.31926921010017395, "geo/layer_27/stable_rank_q_proj": 43.82905197143555, "geo/layer_27/stable_rank_k_proj": 30.676870346069336, "geo/layer_27/stable_rank_o_proj": 108.11021423339844, "geo/layer_27/stable_rank_gate_proj": 72.30877685546875, "geo/layer_27/stable_rank_down_proj": 128.16787719726562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10235997289419174, "geo/layer_27/attn_entropy_mean": 4.316331386566162, "geo/layer_27/attn_entropy_std": 0.6560248136520386, "attnres/final_alpha/block_0": 0.2549625635147095, "attnres/block_norm/0": 1.7777079343795776, "attnres/final_alpha/block_1": 0.004060032311826944, "attnres/block_norm/1": 49986.6875, "attnres/final_alpha/block_2": 0.008627692237496376, "attnres/block_norm/2": 29906.8203125, "attnres/final_alpha/block_3": 0.01072939857840538, "attnres/block_norm/3": 69669.859375, "attnres/final_alpha/block_4": 0.012022176757454872, "attnres/block_norm/4": 17060.33984375, "attnres/final_alpha/block_5": 0.6066501140594482, "attnres/block_norm/5": 7193.8232421875, "attnres/final_alpha/block_6": 0.10294800996780396, "attnres/block_norm/6": 46457.5390625, "geo/tier1_time_s": 1.3607985973358154, "geo/step": 6675.0, "geo/rankme_slope": 0.0007957623479079132} {"step": 6680, "timestamp": 1778332937.1855612, "train/loss": 2.3726391077041624, "train/z_loss": 0.0013592667877674104, "train/perplexity": 10.72566114037322, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700693.0293742118, "perf/iters_per_sec": 0.8109536311026629, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.233116126060486, "data/tokens_consumed": 14011072512, "data/tokens_consumed_B": 14.011072512, "train/loss_slope": -8.731991916385258e-06} {"step": 6690, "timestamp": 1778332947.5483646, "train/loss": 2.360768032073975, "train/z_loss": 0.0013477470492944121, "train/perplexity": 10.599088767302568, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025622.663329414, "perf/iters_per_sec": 0.9658921543738432, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353122711181642, "data/tokens_consumed": 14032044032, "data/tokens_consumed_B": 14.032044032, "train/loss_slope": -9.001459561773438e-06} {"step": 6700, "timestamp": 1778332957.8905718, "grad/layer_0/attn": 0.002895134733989835, "grad/layer_0/mlp": 0.0031693631317466497, "grad/layer_0/attn_mlp_ratio": 0.9134752069406427, "grad/layer_4/attn": 0.00242137280292809, "grad/layer_4/mlp": 0.002634805627167225, "grad/layer_4/attn_mlp_ratio": 0.9189948154285348, "grad/layer_8/attn": 0.003207886591553688, "grad/layer_8/mlp": 0.0034320938866585493, "grad/layer_8/attn_mlp_ratio": 0.9346732939201507, "grad/layer_12/attn": 0.006306634750217199, "grad/layer_12/mlp": 0.007814879529178143, "grad/layer_12/attn_mlp_ratio": 0.8070034408041729, "grad/layer_16/attn": 0.004211242776364088, "grad/layer_16/mlp": 0.004912885371595621, "grad/layer_16/attn_mlp_ratio": 0.857183176915449, "grad/layer_20/attn": 0.003312997054308653, "grad/layer_20/mlp": 0.006628029048442841, "grad/layer_20/attn_mlp_ratio": 0.4998464822815382, "grad/layer_24/attn": 0.0098988963291049, "grad/layer_24/mlp": 0.012170436792075634, "grad/layer_24/attn_mlp_ratio": 0.8133558734896551, "grad/layer_27/attn": 0.008348413743078709, "grad/layer_27/mlp": 0.012588286772370338, "grad/layer_27/attn_mlp_ratio": 0.663189028636009} {"step": 6700, "timestamp": 1778332957.906795, "train/loss": 2.346125769615173, "train/z_loss": 0.001356045692227781, "train/perplexity": 10.445024802795846, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025814.215595113, "perf/iters_per_sec": 0.9659834936118665, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035214376449585, "data/tokens_consumed": 14053015552, "data/tokens_consumed_B": 14.053015552, "train/loss_slope": -9.391683062883318e-06} {"step": 6710, "timestamp": 1778332968.2711525, "train/loss": 2.3304906845092774, "train/z_loss": 0.0013542361208237707, "train/perplexity": 10.282985997259981, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024273.6337506822, "perf/iters_per_sec": 0.9652488869431888, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036002230644226, "data/tokens_consumed": 14073987072, "data/tokens_consumed_B": 14.073987072, "train/loss_slope": -1.0770594133044578e-05} {"step": 6720, "timestamp": 1778332978.6417615, "train/loss": 2.352753758430481, "train/z_loss": 0.001361326454207301, "train/perplexity": 10.514484244284423, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023735.57717313, "perf/iters_per_sec": 0.9649923215737962, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036277675628662, "data/tokens_consumed": 14094958592, "data/tokens_consumed_B": 14.094958592, "train/loss_slope": -9.385520718743607e-06} {"step": 6730, "timestamp": 1778332989.0030851, "train/loss": 2.3673509836196898, "train/z_loss": 0.001355414732825011, "train/perplexity": 10.669092216943724, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025401.57886811, "perf/iters_per_sec": 0.9657867330875921, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354252815246583, "data/tokens_consumed": 14115930112, "data/tokens_consumed_B": 14.115930112, "train/loss_slope": -1.0643200769890821e-05} {"step": 6740, "timestamp": 1778332999.3499248, "train/loss": 2.328762435913086, "train/z_loss": 0.0013669929234310985, "train/perplexity": 10.265229789136026, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028136.8553203663, "perf/iters_per_sec": 0.967091014537986, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340288400650024, "data/tokens_consumed": 14136901632, "data/tokens_consumed_B": 14.136901632, "train/loss_slope": -1.1782240445571661e-05} {"step": 6750, "timestamp": 1778333009.703223, "grad/layer_0/attn": 0.002640754682943225, "grad/layer_0/mlp": 0.00290025700815022, "grad/layer_0/attn_mlp_ratio": 0.9105243378327568, "grad/layer_4/attn": 0.002075956901535392, "grad/layer_4/mlp": 0.0026849363930523396, "grad/layer_4/attn_mlp_ratio": 0.7731865937638487, "grad/layer_8/attn": 0.0036362421233206987, "grad/layer_8/mlp": 0.003326533595100045, "grad/layer_8/attn_mlp_ratio": 1.093102447354389, "grad/layer_12/attn": 0.0073341322131454945, "grad/layer_12/mlp": 0.006939827464520931, "grad/layer_12/attn_mlp_ratio": 1.056817643516159, "grad/layer_16/attn": 0.004302069544792175, "grad/layer_16/mlp": 0.00457347184419632, "grad/layer_16/attn_mlp_ratio": 0.9406572506149184, "grad/layer_20/attn": 0.005259011872112751, "grad/layer_20/mlp": 0.005716681946069002, "grad/layer_20/attn_mlp_ratio": 0.9199412928219506, "grad/layer_24/attn": 0.00783973466604948, "grad/layer_24/mlp": 0.008817783556878567, "grad/layer_24/attn_mlp_ratio": 0.8890822196498175, "grad/layer_27/attn": 0.005362335126847029, "grad/layer_27/mlp": 0.007918640039861202, "grad/layer_27/attn_mlp_ratio": 0.6771787872836736} {"step": 6750, "timestamp": 1778333010.2971542, "eos/sharpness": 42.062592506408684, "eos/L0_probe": 2.3380258083343506, "eos/L_plus": 2.5890424251556396, "eos/L_minus": 2.5076351165771484, "eos/grad_norm": 0.11837875843048096, "eos/embed_grad_frac": 0.15411870181560516, "eos/time_s": 0.5911023616790771} {"step": 6750, "timestamp": 1778333010.316178, "train/loss": 2.3561472654342652, "train/z_loss": 0.001367458258755505, "train/perplexity": 10.550225830570602, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913651.8803830545, "perf/iters_per_sec": 0.9125003244319222, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0958900213241578, "data/tokens_consumed": 14157873152, "data/tokens_consumed_B": 14.157873152, "train/loss_slope": -1.1305387850606037e-05} {"step": 6750, "timestamp": 1778333011.677336, "geo/rankme_last": 428.4487609863281, "geo/layer_0/stable_rank_q_proj": 20.728538513183594, "geo/layer_0/stable_rank_k_proj": 17.18172264099121, "geo/layer_0/stable_rank_o_proj": 44.92518997192383, "geo/layer_0/stable_rank_gate_proj": 128.1076202392578, "geo/layer_0/stable_rank_down_proj": 56.59307861328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0684414729475975, "geo/layer_0/attn_entropy_mean": 6.247128963470459, "geo/layer_0/attn_entropy_std": 0.4369157552719116, "geo/layer_7/stable_rank_q_proj": 42.127410888671875, "geo/layer_7/stable_rank_k_proj": 39.15924072265625, "geo/layer_7/stable_rank_o_proj": 89.63024139404297, "geo/layer_7/stable_rank_gate_proj": 79.09199523925781, "geo/layer_7/stable_rank_down_proj": 144.23995971679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41780656576156616, "geo/layer_7/attn_entropy_mean": 4.731509685516357, "geo/layer_7/attn_entropy_std": 0.779338538646698, "geo/layer_14/stable_rank_q_proj": 51.88026428222656, "geo/layer_14/stable_rank_k_proj": 42.26682662963867, "geo/layer_14/stable_rank_o_proj": 42.55877685546875, "geo/layer_14/stable_rank_gate_proj": 72.05274963378906, "geo/layer_14/stable_rank_down_proj": 126.99594116210938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37531614303588867, "geo/layer_14/attn_entropy_mean": 5.522902488708496, "geo/layer_14/attn_entropy_std": 0.4291558861732483, "geo/layer_21/stable_rank_q_proj": 39.40981674194336, "geo/layer_21/stable_rank_k_proj": 28.99706268310547, "geo/layer_21/stable_rank_o_proj": 66.09876251220703, "geo/layer_21/stable_rank_gate_proj": 61.815093994140625, "geo/layer_21/stable_rank_down_proj": 49.61317825317383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13809750974178314, "geo/layer_21/attn_entropy_mean": 5.865711212158203, "geo/layer_21/attn_entropy_std": 0.3190765380859375, "geo/layer_27/stable_rank_q_proj": 43.82386016845703, "geo/layer_27/stable_rank_k_proj": 30.592912673950195, "geo/layer_27/stable_rank_o_proj": 108.173583984375, "geo/layer_27/stable_rank_gate_proj": 72.33891296386719, "geo/layer_27/stable_rank_down_proj": 128.09783935546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10033220797777176, "geo/layer_27/attn_entropy_mean": 4.31210994720459, "geo/layer_27/attn_entropy_std": 0.6705725789070129, "attnres/final_alpha/block_0": 0.2519835829734802, "attnres/block_norm/0": 1.7775115966796875, "attnres/final_alpha/block_1": 0.003966991789638996, "attnres/block_norm/1": 49993.33984375, "attnres/final_alpha/block_2": 0.008490553125739098, "attnres/block_norm/2": 29892.60546875, "attnres/final_alpha/block_3": 0.010417542420327663, "attnres/block_norm/3": 69793.53125, "attnres/final_alpha/block_4": 0.011888264678418636, "attnres/block_norm/4": 16939.228515625, "attnres/final_alpha/block_5": 0.6133103966712952, "attnres/block_norm/5": 7006.583984375, "attnres/final_alpha/block_6": 0.09994267672300339, "attnres/block_norm/6": 46169.22265625, "geo/tier1_time_s": 1.3572423458099365, "geo/step": 6750.0, "geo/rankme_slope": 0.000780932255714786} {"step": 6760, "timestamp": 1778333022.0206242, "train/loss": 2.356243538856506, "train/z_loss": 0.0013659535674378277, "train/perplexity": 10.551241585811058, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792309.6957382704, "perf/iters_per_sec": 0.8546398619357445, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.170083498954773, "data/tokens_consumed": 14178844672, "data/tokens_consumed_B": 14.178844672, "train/loss_slope": -1.2613976684877052e-05} {"step": 6770, "timestamp": 1778333032.3709846, "train/loss": 2.379512310028076, "train/z_loss": 0.0013531456352211535, "train/perplexity": 10.79963470587525, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027320.9311453863, "perf/iters_per_sec": 0.9667019515730793, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344449996948242, "data/tokens_consumed": 14199816192, "data/tokens_consumed_B": 14.199816192, "train/loss_slope": -1.3137014842841721e-05} {"step": 6780, "timestamp": 1778333042.7200596, "train/loss": 2.377845621109009, "train/z_loss": 0.0013539391220547259, "train/perplexity": 10.781650065944259, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027496.0272142594, "perf/iters_per_sec": 0.9667854438849732, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343556642532348, "data/tokens_consumed": 14220787712, "data/tokens_consumed_B": 14.220787712, "train/loss_slope": -8.722965473389095e-06} {"step": 6790, "timestamp": 1778333053.0894413, "train/loss": 2.361110305786133, "train/z_loss": 0.001359783427324146, "train/perplexity": 10.602717177679768, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023401.6537494448, "perf/iters_per_sec": 0.9648330944773887, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364486932754517, "data/tokens_consumed": 14241759232, "data/tokens_consumed_B": 14.241759232, "train/loss_slope": -8.307539118398548e-06} {"step": 6800, "timestamp": 1778333063.4580388, "grad/layer_0/attn": 0.0028926595114171505, "grad/layer_0/mlp": 0.0032169725745916367, "grad/layer_0/attn_mlp_ratio": 0.8991868455284114, "grad/layer_4/attn": 0.002342113759368658, "grad/layer_4/mlp": 0.002574401907622814, "grad/layer_4/attn_mlp_ratio": 0.9097700174384771, "grad/layer_8/attn": 0.0026840565260499716, "grad/layer_8/mlp": 0.003563936334103346, "grad/layer_8/attn_mlp_ratio": 0.7531157122686606, "grad/layer_12/attn": 0.0053133973851799965, "grad/layer_12/mlp": 0.006550600286573172, "grad/layer_12/attn_mlp_ratio": 0.8111313576799645, "grad/layer_16/attn": 0.007365580648183823, "grad/layer_16/mlp": 0.004345946479588747, "grad/layer_16/attn_mlp_ratio": 1.6948161955733982, "grad/layer_20/attn": 0.004194394685328007, "grad/layer_20/mlp": 0.005805677734315395, "grad/layer_20/attn_mlp_ratio": 0.7224642505197861, "grad/layer_24/attn": 0.012314016930758953, "grad/layer_24/mlp": 0.0105909937992692, "grad/layer_24/attn_mlp_ratio": 1.162687567179946, "grad/layer_27/attn": 0.011516858823597431, "grad/layer_27/mlp": 0.008799528703093529, "grad/layer_27/attn_mlp_ratio": 1.3088040372739742} {"step": 6800, "timestamp": 1778333063.4737146, "train/loss": 2.3477359294891356, "train/z_loss": 0.0013541797758080064, "train/perplexity": 10.461856509848992, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021008.3438795262, "perf/iters_per_sec": 0.9636918754003173, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376760721206666, "data/tokens_consumed": 14262730752, "data/tokens_consumed_B": 14.262730752, "train/loss_slope": -7.340147815021083e-06} {"step": 6810, "timestamp": 1778333073.850511, "train/loss": 2.3557365417480467, "train/z_loss": 0.0013523237663321196, "train/perplexity": 10.54589349268483, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021954.4838878987, "perf/iters_per_sec": 0.9641430301131719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371905088424682, "data/tokens_consumed": 14283702272, "data/tokens_consumed_B": 14.283702272, "train/loss_slope": -6.362789728031359e-06} {"step": 6820, "timestamp": 1778333084.221732, "train/loss": 2.336034393310547, "train/z_loss": 0.0013500512111932039, "train/perplexity": 10.340150181631563, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022997.3968576624, "perf/iters_per_sec": 0.9646403297699272, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036655807495117, "data/tokens_consumed": 14304673792, "data/tokens_consumed_B": 14.304673792, "train/loss_slope": -8.283606413925104e-06} {"step": 6825, "timestamp": 1778333089.9857354, "eos/sharpness": 43.86360645294189, "eos/L0_probe": 2.3412253856658936, "eos/L_plus": 2.5140573978424072, "eos/L_minus": 2.607029438018799, "eos/grad_norm": 0.1314428150653839, "eos/embed_grad_frac": 0.14248482882976532, "eos/time_s": 0.5914251804351807} {"step": 6825, "timestamp": 1778333091.378451, "geo/rankme_last": 428.1140441894531, "geo/layer_0/stable_rank_q_proj": 20.71796417236328, "geo/layer_0/stable_rank_k_proj": 17.18587875366211, "geo/layer_0/stable_rank_o_proj": 44.90095901489258, "geo/layer_0/stable_rank_gate_proj": 128.1981658935547, "geo/layer_0/stable_rank_down_proj": 56.590675354003906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06510958075523376, "geo/layer_0/attn_entropy_mean": 6.245524883270264, "geo/layer_0/attn_entropy_std": 0.44251060485839844, "geo/layer_7/stable_rank_q_proj": 42.119895935058594, "geo/layer_7/stable_rank_k_proj": 39.14809036254883, "geo/layer_7/stable_rank_o_proj": 89.81019592285156, "geo/layer_7/stable_rank_gate_proj": 79.18589782714844, "geo/layer_7/stable_rank_down_proj": 144.5206298828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4124600291252136, "geo/layer_7/attn_entropy_mean": 4.732463836669922, "geo/layer_7/attn_entropy_std": 0.7573784589767456, "geo/layer_14/stable_rank_q_proj": 51.912532806396484, "geo/layer_14/stable_rank_k_proj": 42.29924392700195, "geo/layer_14/stable_rank_o_proj": 42.58984375, "geo/layer_14/stable_rank_gate_proj": 72.02383422851562, "geo/layer_14/stable_rank_down_proj": 127.14921569824219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3703582286834717, "geo/layer_14/attn_entropy_mean": 5.520940780639648, "geo/layer_14/attn_entropy_std": 0.44676679372787476, "geo/layer_21/stable_rank_q_proj": 39.42447280883789, "geo/layer_21/stable_rank_k_proj": 28.925277709960938, "geo/layer_21/stable_rank_o_proj": 66.05140686035156, "geo/layer_21/stable_rank_gate_proj": 61.77578353881836, "geo/layer_21/stable_rank_down_proj": 49.67766189575195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13524509966373444, "geo/layer_21/attn_entropy_mean": 5.844011306762695, "geo/layer_21/attn_entropy_std": 0.317645400762558, "geo/layer_27/stable_rank_q_proj": 43.85066604614258, "geo/layer_27/stable_rank_k_proj": 30.57592010498047, "geo/layer_27/stable_rank_o_proj": 108.11109161376953, "geo/layer_27/stable_rank_gate_proj": 72.33964538574219, "geo/layer_27/stable_rank_down_proj": 128.22433471679688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1109149158000946, "geo/layer_27/attn_entropy_mean": 4.330334663391113, "geo/layer_27/attn_entropy_std": 0.6563525795936584, "attnres/final_alpha/block_0": 0.2545252740383148, "attnres/block_norm/0": 1.7775685787200928, "attnres/final_alpha/block_1": 0.0040230778977274895, "attnres/block_norm/1": 49981.1875, "attnres/final_alpha/block_2": 0.008604870177805424, "attnres/block_norm/2": 29842.28125, "attnres/final_alpha/block_3": 0.010519107803702354, "attnres/block_norm/3": 70246.640625, "attnres/final_alpha/block_4": 0.012048862874507904, "attnres/block_norm/4": 16918.90625, "attnres/final_alpha/block_5": 0.6067336201667786, "attnres/block_norm/5": 7093.24853515625, "attnres/final_alpha/block_6": 0.10354519635438919, "attnres/block_norm/6": 46522.14453125, "geo/tier1_time_s": 1.3646976947784424, "geo/step": 6825.0, "geo/rankme_slope": 0.0007298772438662965} {"step": 6830, "timestamp": 1778333096.5678535, "train/loss": 2.36048903465271, "train/z_loss": 0.0013415576308034361, "train/perplexity": 10.596132061344582, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699399.0443334635, "perf/iters_per_sec": 0.8103366109530752, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340550661087035, "data/tokens_consumed": 14325645312, "data/tokens_consumed_B": 14.325645312, "train/loss_slope": -8.945162232631068e-06} {"step": 6840, "timestamp": 1778333106.9435744, "train/loss": 2.3400394916534424, "train/z_loss": 0.0013709484250284732, "train/perplexity": 10.38164654302383, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022169.5167072862, "perf/iters_per_sec": 0.9642455657516891, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037080216407776, "data/tokens_consumed": 14346616832, "data/tokens_consumed_B": 14.346616832, "train/loss_slope": -8.991854833905666e-06} {"step": 6850, "timestamp": 1778333117.2965074, "grad/layer_0/attn": 0.002785139949992299, "grad/layer_0/mlp": 0.0032424197997897863, "grad/layer_0/attn_mlp_ratio": 0.8589695462246781, "grad/layer_4/attn": 0.0022369890939444304, "grad/layer_4/mlp": 0.002611679956316948, "grad/layer_4/attn_mlp_ratio": 0.8565325942332632, "grad/layer_8/attn": 0.0060849436558783054, "grad/layer_8/mlp": 0.0035585747100412846, "grad/layer_8/attn_mlp_ratio": 1.7099383828347032, "grad/layer_12/attn": 0.007651505991816521, "grad/layer_12/mlp": 0.006857623346149921, "grad/layer_12/attn_mlp_ratio": 1.1157664243160377, "grad/layer_16/attn": 0.004779359791427851, "grad/layer_16/mlp": 0.004250423517078161, "grad/layer_16/attn_mlp_ratio": 1.1244431666115418, "grad/layer_20/attn": 0.0027003330178558826, "grad/layer_20/mlp": 0.005827313754707575, "grad/layer_20/attn_mlp_ratio": 0.46339241118348695, "grad/layer_24/attn": 0.012598930858075619, "grad/layer_24/mlp": 0.009361881762742996, "grad/layer_24/attn_mlp_ratio": 1.3457690497265236, "grad/layer_27/attn": 0.009242291562259197, "grad/layer_27/mlp": 0.010068661533296108, "grad/layer_27/attn_mlp_ratio": 0.9179265227958219} {"step": 6850, "timestamp": 1778333117.3119633, "train/loss": 2.3300408601760862, "train/z_loss": 0.0013538280152715742, "train/perplexity": 10.278361500124198, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023424.6938585753, "perf/iters_per_sec": 0.9648440808575512, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364368915557862, "data/tokens_consumed": 14367588352, "data/tokens_consumed_B": 14.367588352, "train/loss_slope": -9.833610562136569e-06} {"step": 6860, "timestamp": 1778333127.68024, "train/loss": 2.3699865102767945, "train/z_loss": 0.0013672798406332732, "train/perplexity": 10.69724798022343, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023629.6110034226, "perf/iters_per_sec": 0.9649417929665673, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363319396972657, "data/tokens_consumed": 14388559872, "data/tokens_consumed_B": 14.388559872, "train/loss_slope": -9.447048706869609e-06} {"step": 6870, "timestamp": 1778333138.0499084, "train/loss": 2.3468464612960815, "train/z_loss": 0.001361436548177153, "train/perplexity": 10.452555158484515, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023626.7711137834, "perf/iters_per_sec": 0.9649404388016621, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036333394050598, "data/tokens_consumed": 14409531392, "data/tokens_consumed_B": 14.409531392, "train/loss_slope": -8.715897195874996e-06} {"step": 6880, "timestamp": 1778333148.425031, "train/loss": 2.3347395181655886, "train/z_loss": 0.001366195094306022, "train/perplexity": 10.326769643099214, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022847.3133481995, "perf/iters_per_sec": 0.9645687643757818, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367327213287354, "data/tokens_consumed": 14430502912, "data/tokens_consumed_B": 14.430502912, "train/loss_slope": -8.507567771089858e-06} {"step": 6890, "timestamp": 1778333158.79864, "train/loss": 2.359816288948059, "train/z_loss": 0.0013434453518129884, "train/perplexity": 10.589005956311441, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022849.9649714439, "perf/iters_per_sec": 0.9645700287682742, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367313623428345, "data/tokens_consumed": 14451474432, "data/tokens_consumed_B": 14.451474432, "train/loss_slope": -7.672201117130157e-06} {"step": 6900, "timestamp": 1778333169.1536498, "grad/layer_0/attn": 0.003266161773353815, "grad/layer_0/mlp": 0.0034399100113660097, "grad/layer_0/attn_mlp_ratio": 0.9494904423699612, "grad/layer_4/attn": 0.0017886112909764051, "grad/layer_4/mlp": 0.002655349439010024, "grad/layer_4/attn_mlp_ratio": 0.6735878891647692, "grad/layer_8/attn": 0.0050584995187819, "grad/layer_8/mlp": 0.0033907669130712748, "grad/layer_8/attn_mlp_ratio": 1.4918452076717692, "grad/layer_12/attn": 0.010346471332013607, "grad/layer_12/mlp": 0.006900336593389511, "grad/layer_12/attn_mlp_ratio": 1.4994154331520484, "grad/layer_16/attn": 0.003375955857336521, "grad/layer_16/mlp": 0.0045546586625278, "grad/layer_16/attn_mlp_ratio": 0.7412093931407702, "grad/layer_20/attn": 0.0026421004440635443, "grad/layer_20/mlp": 0.005940514151006937, "grad/layer_20/attn_mlp_ratio": 0.44475954983454496, "grad/layer_24/attn": 0.00793316401541233, "grad/layer_24/mlp": 0.009541179053485394, "grad/layer_24/attn_mlp_ratio": 0.8314657850769258, "grad/layer_27/attn": 0.00620760302990675, "grad/layer_27/mlp": 0.007302654441446066, "grad/layer_27/attn_mlp_ratio": 0.8500474717345079} {"step": 6900, "timestamp": 1778333169.764507, "eos/sharpness": 39.47374820709228, "eos/L0_probe": 2.3374500274658203, "eos/L_plus": 2.5747451782226562, "eos/L_minus": 2.4948923587799072, "eos/grad_norm": 0.12718001008033752, "eos/embed_grad_frac": 0.1443837434053421, "eos/time_s": 0.6080079078674316} {"step": 6900, "timestamp": 1778333169.7857108, "train/loss": 2.3729241132736205, "train/z_loss": 0.0013368957675993442, "train/perplexity": 10.728718449188671, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909930.9501271842, "perf/iters_per_sec": 0.910726046622841, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0980250358581543, "data/tokens_consumed": 14472445952, "data/tokens_consumed_B": 14.472445952, "train/loss_slope": -1.0363606751376447e-05} {"step": 6900, "timestamp": 1778333171.1489265, "geo/rankme_last": 427.6800537109375, "geo/layer_0/stable_rank_q_proj": 20.781776428222656, "geo/layer_0/stable_rank_k_proj": 17.243900299072266, "geo/layer_0/stable_rank_o_proj": 44.85795974731445, "geo/layer_0/stable_rank_gate_proj": 128.4774627685547, "geo/layer_0/stable_rank_down_proj": 56.65214538574219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06660135835409164, "geo/layer_0/attn_entropy_mean": 6.248335361480713, "geo/layer_0/attn_entropy_std": 0.4376637041568756, "geo/layer_7/stable_rank_q_proj": 42.119239807128906, "geo/layer_7/stable_rank_k_proj": 39.146644592285156, "geo/layer_7/stable_rank_o_proj": 89.63638305664062, "geo/layer_7/stable_rank_gate_proj": 79.14484405517578, "geo/layer_7/stable_rank_down_proj": 144.05923461914062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40182292461395264, "geo/layer_7/attn_entropy_mean": 4.719026565551758, "geo/layer_7/attn_entropy_std": 0.7705180644989014, "geo/layer_14/stable_rank_q_proj": 51.866153717041016, "geo/layer_14/stable_rank_k_proj": 42.3166389465332, "geo/layer_14/stable_rank_o_proj": 42.574851989746094, "geo/layer_14/stable_rank_gate_proj": 72.03102111816406, "geo/layer_14/stable_rank_down_proj": 126.86833953857422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37014827132225037, "geo/layer_14/attn_entropy_mean": 5.544748783111572, "geo/layer_14/attn_entropy_std": 0.4270501732826233, "geo/layer_21/stable_rank_q_proj": 39.32979965209961, "geo/layer_21/stable_rank_k_proj": 28.845617294311523, "geo/layer_21/stable_rank_o_proj": 65.94861602783203, "geo/layer_21/stable_rank_gate_proj": 61.805259704589844, "geo/layer_21/stable_rank_down_proj": 49.65822982788086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13310959935188293, "geo/layer_21/attn_entropy_mean": 5.870875358581543, "geo/layer_21/attn_entropy_std": 0.3225826025009155, "geo/layer_27/stable_rank_q_proj": 43.862693786621094, "geo/layer_27/stable_rank_k_proj": 30.5003662109375, "geo/layer_27/stable_rank_o_proj": 108.17060852050781, "geo/layer_27/stable_rank_gate_proj": 72.28132629394531, "geo/layer_27/stable_rank_down_proj": 128.13084411621094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10627495497465134, "geo/layer_27/attn_entropy_mean": 4.346318244934082, "geo/layer_27/attn_entropy_std": 0.6600017547607422, "attnres/final_alpha/block_0": 0.25185853242874146, "attnres/block_norm/0": 1.7776234149932861, "attnres/final_alpha/block_1": 0.003940129652619362, "attnres/block_norm/1": 50203.9765625, "attnres/final_alpha/block_2": 0.008507529273629189, "attnres/block_norm/2": 29783.447265625, "attnres/final_alpha/block_3": 0.010373152792453766, "attnres/block_norm/3": 70482.8125, "attnres/final_alpha/block_4": 0.01177997887134552, "attnres/block_norm/4": 16998.99609375, "attnres/final_alpha/block_5": 0.6126920580863953, "attnres/block_norm/5": 6986.228515625, "attnres/final_alpha/block_6": 0.10084863007068634, "attnres/block_norm/6": 46844.82421875, "geo/tier1_time_s": 1.3591184616088867, "geo/step": 6900.0, "geo/rankme_slope": 0.0006634142914978492} {"step": 6910, "timestamp": 1778333181.5238302, "train/loss": 2.374607706069946, "train/z_loss": 0.0013567403075285256, "train/perplexity": 10.74679645601441, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787204.7655623995, "perf/iters_per_sec": 0.8522056415378568, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1734256982803344, "data/tokens_consumed": 14493417472, "data/tokens_consumed_B": 14.493417472, "train/loss_slope": -7.888119091259415e-06} {"step": 6920, "timestamp": 1778333191.89114, "train/loss": 2.3822062253952025, "train/z_loss": 0.0013531530392356216, "train/perplexity": 10.82876723042822, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023841.275048855, "perf/iters_per_sec": 0.9650427222484851, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036223554611206, "data/tokens_consumed": 14514388992, "data/tokens_consumed_B": 14.514388992, "train/loss_slope": -6.16470643646396e-06} {"step": 6930, "timestamp": 1778333202.2680788, "train/loss": 2.3444833993911742, "train/z_loss": 0.001361374673433602, "train/perplexity": 10.427884284462145, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022162.4039955446, "perf/iters_per_sec": 0.9642421741464351, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370838642120361, "data/tokens_consumed": 14535360512, "data/tokens_consumed_B": 14.535360512, "train/loss_slope": -8.24288244616546e-06} {"step": 6940, "timestamp": 1778333212.6418993, "train/loss": 2.357696843147278, "train/z_loss": 0.0013482297188602387, "train/perplexity": 10.56658689848417, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022954.081568666, "perf/iters_per_sec": 0.9646196754306154, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366780042648316, "data/tokens_consumed": 14556332032, "data/tokens_consumed_B": 14.556332032, "train/loss_slope": -6.321171529651432e-06} {"step": 6950, "timestamp": 1778333223.0062203, "grad/layer_0/attn": 0.0029027750715613365, "grad/layer_0/mlp": 0.003083040937781334, "grad/layer_0/attn_mlp_ratio": 0.9415298194182575, "grad/layer_4/attn": 0.001689695636741817, "grad/layer_4/mlp": 0.0026607487816363573, "grad/layer_4/attn_mlp_ratio": 0.6350451365040718, "grad/layer_8/attn": 0.005176232662051916, "grad/layer_8/mlp": 0.003535559633746743, "grad/layer_8/attn_mlp_ratio": 1.4640489913506567, "grad/layer_12/attn": 0.007580292411148548, "grad/layer_12/mlp": 0.007415824569761753, "grad/layer_12/attn_mlp_ratio": 1.0221779436152822, "grad/layer_16/attn": 0.003437813837081194, "grad/layer_16/mlp": 0.004694977309554815, "grad/layer_16/attn_mlp_ratio": 0.7322322424991546, "grad/layer_20/attn": 0.0029143572319298983, "grad/layer_20/mlp": 0.006558744236826897, "grad/layer_20/attn_mlp_ratio": 0.4443468265054919, "grad/layer_24/attn": 0.012775666080415249, "grad/layer_24/mlp": 0.009350186213850975, "grad/layer_24/attn_mlp_ratio": 1.3663541721612447, "grad/layer_27/attn": 0.003930849488824606, "grad/layer_27/mlp": 0.00995826255530119, "grad/layer_27/attn_mlp_ratio": 0.39473245734606677} {"step": 6950, "timestamp": 1778333223.0221765, "train/loss": 2.3769896268844604, "train/z_loss": 0.0013610589667223394, "train/perplexity": 10.77242498462816, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021247.4665066109, "perf/iters_per_sec": 0.9638058979542784, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037553310394287, "data/tokens_consumed": 14577303552, "data/tokens_consumed_B": 14.577303552, "train/loss_slope": -5.436970927927744e-06} {"step": 6960, "timestamp": 1778333233.3980033, "train/loss": 2.3778875350952147, "train/z_loss": 0.0013477726257406174, "train/perplexity": 10.782101977347038, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022598.7429134487, "perf/iters_per_sec": 0.9644502367560619, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368601322174071, "data/tokens_consumed": 14598275072, "data/tokens_consumed_B": 14.598275072, "train/loss_slope": -5.046413039932217e-06} {"step": 6970, "timestamp": 1778333243.779652, "train/loss": 2.3661783933639526, "train/z_loss": 0.0013522956287488341, "train/perplexity": 10.6565890753362, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021398.0094973908, "perf/iters_per_sec": 0.9638776824461893, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374760389328004, "data/tokens_consumed": 14619246592, "data/tokens_consumed_B": 14.619246592, "train/loss_slope": -3.539212630598362e-06} {"step": 6975, "timestamp": 1778333249.5578504, "eos/sharpness": 14.341259002685543, "eos/L0_probe": 2.339350700378418, "eos/L_plus": 2.4192094802856445, "eos/L_minus": 2.402904510498047, "eos/grad_norm": 0.09964291006326675, "eos/embed_grad_frac": 0.23081353306770325, "eos/time_s": 0.60064697265625} {"step": 6975, "timestamp": 1778333250.9363625, "geo/rankme_last": 428.7935485839844, "geo/layer_0/stable_rank_q_proj": 20.766809463500977, "geo/layer_0/stable_rank_k_proj": 17.23954963684082, "geo/layer_0/stable_rank_o_proj": 44.84162139892578, "geo/layer_0/stable_rank_gate_proj": 128.63955688476562, "geo/layer_0/stable_rank_down_proj": 56.74332046508789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0675937756896019, "geo/layer_0/attn_entropy_mean": 6.244147300720215, "geo/layer_0/attn_entropy_std": 0.44216465950012207, "geo/layer_7/stable_rank_q_proj": 42.180118560791016, "geo/layer_7/stable_rank_k_proj": 39.074005126953125, "geo/layer_7/stable_rank_o_proj": 89.715576171875, "geo/layer_7/stable_rank_gate_proj": 79.07676696777344, "geo/layer_7/stable_rank_down_proj": 144.0300750732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40941476821899414, "geo/layer_7/attn_entropy_mean": 4.719655990600586, "geo/layer_7/attn_entropy_std": 0.7586536407470703, "geo/layer_14/stable_rank_q_proj": 51.887184143066406, "geo/layer_14/stable_rank_k_proj": 42.26954650878906, "geo/layer_14/stable_rank_o_proj": 42.575721740722656, "geo/layer_14/stable_rank_gate_proj": 71.94338989257812, "geo/layer_14/stable_rank_down_proj": 126.88519287109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3687035143375397, "geo/layer_14/attn_entropy_mean": 5.5312604904174805, "geo/layer_14/attn_entropy_std": 0.4405722916126251, "geo/layer_21/stable_rank_q_proj": 39.23194885253906, "geo/layer_21/stable_rank_k_proj": 28.85273551940918, "geo/layer_21/stable_rank_o_proj": 65.90007019042969, "geo/layer_21/stable_rank_gate_proj": 61.8020133972168, "geo/layer_21/stable_rank_down_proj": 49.69615936279297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13286824524402618, "geo/layer_21/attn_entropy_mean": 5.870477199554443, "geo/layer_21/attn_entropy_std": 0.31427693367004395, "geo/layer_27/stable_rank_q_proj": 43.87400817871094, "geo/layer_27/stable_rank_k_proj": 30.495882034301758, "geo/layer_27/stable_rank_o_proj": 108.12447357177734, "geo/layer_27/stable_rank_gate_proj": 72.20802307128906, "geo/layer_27/stable_rank_down_proj": 128.04434204101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09653675556182861, "geo/layer_27/attn_entropy_mean": 4.339122772216797, "geo/layer_27/attn_entropy_std": 0.6681208610534668, "attnres/final_alpha/block_0": 0.2539435029029846, "attnres/block_norm/0": 1.7775795459747314, "attnres/final_alpha/block_1": 0.003965703770518303, "attnres/block_norm/1": 50060.203125, "attnres/final_alpha/block_2": 0.008667578920722008, "attnres/block_norm/2": 29776.212890625, "attnres/final_alpha/block_3": 0.01057724840939045, "attnres/block_norm/3": 70041.1015625, "attnres/final_alpha/block_4": 0.012102010659873486, "attnres/block_norm/4": 17011.595703125, "attnres/final_alpha/block_5": 0.6102367639541626, "attnres/block_norm/5": 7068.8623046875, "attnres/final_alpha/block_6": 0.10050718486309052, "attnres/block_norm/6": 46719.88671875, "geo/tier1_time_s": 1.357696771621704, "geo/step": 6975.0, "geo/rankme_slope": 0.0006240462982067827} {"step": 6980, "timestamp": 1778333256.1244085, "train/loss": 2.382738709449768, "train/z_loss": 0.0013452980667352677, "train/perplexity": 10.834534911771916, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699451.1507454033, "perf/iters_per_sec": 0.8103614572264687, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340172290802003, "data/tokens_consumed": 14640218112, "data/tokens_consumed_B": 14.640218112, "train/loss_slope": -3.0488131820994384e-06} {"step": 6990, "timestamp": 1778333266.499145, "train/loss": 2.3936763525009157, "train/z_loss": 0.001349313254468143, "train/perplexity": 10.953689635287772, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022560.0023141005, "perf/iters_per_sec": 0.9644317637987616, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368799924850465, "data/tokens_consumed": 14661189632, "data/tokens_consumed_B": 14.661189632, "train/loss_slope": 1.5063801292467954e-06} {"step": 7000, "timestamp": 1778333276.8648558, "grad/layer_0/attn": 0.002869165502488613, "grad/layer_0/mlp": 0.0031936645973473787, "grad/layer_0/attn_mlp_ratio": 0.8983928415752941, "grad/layer_4/attn": 0.0022123323287814856, "grad/layer_4/mlp": 0.0026828069239854813, "grad/layer_4/attn_mlp_ratio": 0.8246334190280017, "grad/layer_8/attn": 0.0028460121247917414, "grad/layer_8/mlp": 0.003466275753453374, "grad/layer_8/attn_mlp_ratio": 0.8210575975816589, "grad/layer_12/attn": 0.005763741675764322, "grad/layer_12/mlp": 0.006514411419630051, "grad/layer_12/attn_mlp_ratio": 0.8847678195330899, "grad/layer_16/attn": 0.0037876649294048548, "grad/layer_16/mlp": 0.00439411960542202, "grad/layer_16/attn_mlp_ratio": 0.8619849215148043, "grad/layer_20/attn": 0.0030880910344421864, "grad/layer_20/mlp": 0.006216293666511774, "grad/layer_20/attn_mlp_ratio": 0.4967736645713649, "grad/layer_24/attn": 0.006872572470456362, "grad/layer_24/mlp": 0.008365912362933159, "grad/layer_24/attn_mlp_ratio": 0.8214970573630388, "grad/layer_27/attn": 0.006191540043801069, "grad/layer_27/mlp": 0.008653460070490837, "grad/layer_27/attn_mlp_ratio": 0.7154987625545258} {"step": 7000, "timestamp": 1778333276.8807, "train/loss": 2.3326555490493774, "train/z_loss": 0.0013680594274774194, "train/perplexity": 10.305271382728549, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021040.8025367975, "perf/iters_per_sec": 0.9637073528942096, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376594066619873, "data/tokens_consumed": 14682161152, "data/tokens_consumed_B": 14.682161152, "train/loss_slope": 1.6589570753645623e-06} {"step": 7000, "timestamp": 1778333283.8927896, "geo/ww_alpha_mean": 7.810635594142485, "geo/ww_alpha_std": 5.118977635740217, "geo/ww_alpha_min": 1.3374745812610989, "geo/ww_alpha_max": 36.0065441053125, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.887077652329734, "geo/ww_alpha_by_type/k_proj": 4.313644978441097, "geo/ww_alpha_by_type/v_proj": 9.388584992987818, "geo/ww_alpha_by_type/o_proj": 8.856249119363103, "geo/ww_alpha_by_type/gate_proj": 7.776165439927624, "geo/ww_alpha_by_type/up_proj": 12.479639174758123, "geo/ww_alpha_by_type/down_proj": 8.077519041328369, "geo/twonn_id/layer_0": 0.7500061392784119, "geo/twonn_id/layer_7": 3.6268110275268555, "geo/twonn_id/layer_14": 5.778730869293213, "geo/twonn_id/layer_21": 8.892243385314941, "geo/twonn_id/layer_27": 6.457676887512207, "geo/tier2_time_s": 7.0039098262786865} {"step": 7000, "timestamp": 1778333284.6451807, "eoc/jacobian_sigma/layer_0/attn": 1672.2855224609375, "eoc/jacobian_sigma/layer_0/mlp": 10609.6396484375, "eoc/jacobian_sigma/layer_0": 10609.6396484375, "eoc/jacobian_sigma/layer_7/attn": 1.1427439451217651, "eoc/jacobian_sigma/layer_7/mlp": 1.8264111280441284, "eoc/jacobian_sigma/layer_7": 1.8264111280441284, "eoc/jacobian_sigma/layer_14/attn": 1.899344563484192, "eoc/jacobian_sigma/layer_14/mlp": 10.89072322845459, "eoc/jacobian_sigma/layer_14": 10.89072322845459, "eoc/jacobian_sigma/layer_21/attn": 1.100719690322876, "eoc/jacobian_sigma/layer_21/mlp": 4.79263162612915, "eoc/jacobian_sigma/layer_21": 4.79263162612915, "eoc/jacobian_sigma/layer_27/attn": 3.748124122619629, "eoc/jacobian_sigma/layer_27/mlp": 34.71455001831055, "eoc/jacobian_sigma/layer_27": 34.71455001831055, "eoc/layer0_sigma": 10609.6396484375, "eoc/sigma_max": 34.71455001831055, "eoc/sigma_min": 1.8264111280441284, "eoc/sigma_mean": 13.056079000234604, "eoc/time_s": 0.7461047172546387} {"step": 7010, "timestamp": 1778333295.033475, "train/loss": 2.311532974243164, "train/z_loss": 0.0013560089166276157, "train/perplexity": 10.089880332075092, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1155711.5396324587, "perf/iters_per_sec": 0.5510862062608999, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8145981311798096, "data/tokens_consumed": 14703132672, "data/tokens_consumed_B": 14.703132672, "train/loss_slope": -1.6747322019570393e-06} {"step": 7020, "timestamp": 1778333305.4035108, "train/loss": 2.3695094347000123, "train/z_loss": 0.0013612239505164324, "train/perplexity": 10.692145801632453, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023577.9355701993, "perf/iters_per_sec": 0.9649171521998402, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036358404159546, "data/tokens_consumed": 14724104192, "data/tokens_consumed_B": 14.724104192, "train/loss_slope": -1.4808565512312486e-06} {"step": 7030, "timestamp": 1778333316.4472866, "train/loss": 2.31940438747406, "train/z_loss": 0.0013651246554218233, "train/perplexity": 10.169615351570469, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1899999.484225149, "perf/iters_per_sec": 0.9059903546453233, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1037645101547242, "data/tokens_consumed": 14745075712, "data/tokens_consumed_B": 14.745075712, "train/loss_slope": -2.551668283283367e-06} {"step": 7040, "timestamp": 1778333326.81778, "train/loss": 2.317481827735901, "train/z_loss": 0.001357231824658811, "train/perplexity": 10.150082441153065, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023618.949819589, "perf/iters_per_sec": 0.964936709317965, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036337399482727, "data/tokens_consumed": 14766047232, "data/tokens_consumed_B": 14.766047232, "train/loss_slope": -4.0931276326560974e-06} {"step": 7050, "timestamp": 1778333337.1721656, "grad/layer_0/attn": 0.0029157474637031555, "grad/layer_0/mlp": 0.0032614388037472963, "grad/layer_0/attn_mlp_ratio": 0.8940064645555826, "grad/layer_4/attn": 0.00216837995685637, "grad/layer_4/mlp": 0.0027032496873289347, "grad/layer_4/attn_mlp_ratio": 0.8021382141673757, "grad/layer_8/attn": 0.004595227539539337, "grad/layer_8/mlp": 0.0033936393447220325, "grad/layer_8/attn_mlp_ratio": 1.3540706413835677, "grad/layer_12/attn": 0.010942826978862286, "grad/layer_12/mlp": 0.007383744698017836, "grad/layer_12/attn_mlp_ratio": 1.4820158711064724, "grad/layer_16/attn": 0.004789363592863083, "grad/layer_16/mlp": 0.005427221301943064, "grad/layer_16/attn_mlp_ratio": 0.8824706490043664, "grad/layer_20/attn": 0.00672548171132803, "grad/layer_20/mlp": 0.007798558101058006, "grad/layer_20/attn_mlp_ratio": 0.8624006563694818, "grad/layer_24/attn": 0.01845777966082096, "grad/layer_24/mlp": 0.0159777719527483, "grad/layer_24/attn_mlp_ratio": 1.1552161089722193, "grad/layer_27/attn": 0.014133811928331852, "grad/layer_27/mlp": 0.014013988897204399, "grad/layer_27/attn_mlp_ratio": 1.0085502372772917} {"step": 7050, "timestamp": 1778333337.817847, "eos/sharpness": 66.2853956222534, "eos/L0_probe": 2.3339967727661133, "eos/L_plus": 2.6217994689941406, "eos/L_minus": 2.70904803276062, "eos/grad_norm": 0.2640264928340912, "eos/embed_grad_frac": 0.028815066441893578, "eos/time_s": 0.6428399085998535} {"step": 7050, "timestamp": 1778333337.8387747, "train/loss": 2.3257726430892944, "train/z_loss": 0.0013661984470672906, "train/perplexity": 10.234584712820194, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903926.0054663052, "perf/iters_per_sec": 0.9078626658755804, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.101488184928894, "data/tokens_consumed": 14787018752, "data/tokens_consumed_B": 14.787018752, "train/loss_slope": -4.633195887375849e-06} {"step": 7050, "timestamp": 1778333339.2038562, "geo/rankme_last": 427.70166015625, "geo/layer_0/stable_rank_q_proj": 20.76347541809082, "geo/layer_0/stable_rank_k_proj": 17.25986671447754, "geo/layer_0/stable_rank_o_proj": 44.87565612792969, "geo/layer_0/stable_rank_gate_proj": 128.5678253173828, "geo/layer_0/stable_rank_down_proj": 56.76190185546875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06877975165843964, "geo/layer_0/attn_entropy_mean": 6.246611595153809, "geo/layer_0/attn_entropy_std": 0.4396458864212036, "geo/layer_7/stable_rank_q_proj": 42.17339324951172, "geo/layer_7/stable_rank_k_proj": 39.044166564941406, "geo/layer_7/stable_rank_o_proj": 89.63774108886719, "geo/layer_7/stable_rank_gate_proj": 79.00462341308594, "geo/layer_7/stable_rank_down_proj": 144.15655517578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4111180603504181, "geo/layer_7/attn_entropy_mean": 4.733847141265869, "geo/layer_7/attn_entropy_std": 0.7572136521339417, "geo/layer_14/stable_rank_q_proj": 51.845394134521484, "geo/layer_14/stable_rank_k_proj": 42.22865676879883, "geo/layer_14/stable_rank_o_proj": 42.54977035522461, "geo/layer_14/stable_rank_gate_proj": 71.90274810791016, "geo/layer_14/stable_rank_down_proj": 126.769287109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3713296353816986, "geo/layer_14/attn_entropy_mean": 5.512178421020508, "geo/layer_14/attn_entropy_std": 0.44446370005607605, "geo/layer_21/stable_rank_q_proj": 39.31196975708008, "geo/layer_21/stable_rank_k_proj": 28.89363670349121, "geo/layer_21/stable_rank_o_proj": 65.90966033935547, "geo/layer_21/stable_rank_gate_proj": 61.83420944213867, "geo/layer_21/stable_rank_down_proj": 49.6590461730957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13573376834392548, "geo/layer_21/attn_entropy_mean": 5.869749069213867, "geo/layer_21/attn_entropy_std": 0.3102914094924927, "geo/layer_27/stable_rank_q_proj": 43.926612854003906, "geo/layer_27/stable_rank_k_proj": 30.398094177246094, "geo/layer_27/stable_rank_o_proj": 108.16940307617188, "geo/layer_27/stable_rank_gate_proj": 72.10751342773438, "geo/layer_27/stable_rank_down_proj": 128.11111450195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10176306962966919, "geo/layer_27/attn_entropy_mean": 4.3359375, "geo/layer_27/attn_entropy_std": 0.6669307947158813, "attnres/final_alpha/block_0": 0.2549123466014862, "attnres/block_norm/0": 1.77774977684021, "attnres/final_alpha/block_1": 0.003992751706391573, "attnres/block_norm/1": 49981.25, "attnres/final_alpha/block_2": 0.008662930689752102, "attnres/block_norm/2": 29871.71875, "attnres/final_alpha/block_3": 0.010684234090149403, "attnres/block_norm/3": 69700.5546875, "attnres/final_alpha/block_4": 0.01210920698940754, "attnres/block_norm/4": 17036.07421875, "attnres/final_alpha/block_5": 0.6063220500946045, "attnres/block_norm/5": 7096.08056640625, "attnres/final_alpha/block_6": 0.10331648588180542, "attnres/block_norm/6": 46690.78515625, "geo/tier1_time_s": 1.360826015472412, "geo/step": 7050.0, "geo/rankme_slope": 0.0006113975863783013} {"step": 7060, "timestamp": 1778333349.5818946, "train/loss": 2.3620614767074586, "train/z_loss": 0.0013495968654751779, "train/perplexity": 10.612806971744869, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786455.6577104295, "perf/iters_per_sec": 0.8518484390785358, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1739177465438844, "data/tokens_consumed": 14807990272, "data/tokens_consumed_B": 14.807990272, "train/loss_slope": -4.532750604963075e-06} {"step": 7070, "timestamp": 1778333359.9537182, "train/loss": 2.3256587982177734, "train/z_loss": 0.0013596200267784297, "train/perplexity": 10.233419624159435, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023715.090756542, "perf/iters_per_sec": 0.9649825528891287, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362881660461425, "data/tokens_consumed": 14828961792, "data/tokens_consumed_B": 14.828961792, "train/loss_slope": -6.28685205861888e-06} {"step": 7080, "timestamp": 1778333370.326638, "train/loss": 2.346037983894348, "train/z_loss": 0.0013478384236805141, "train/perplexity": 10.444107919009738, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022799.3992487297, "perf/iters_per_sec": 0.9645459171527527, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367572784423829, "data/tokens_consumed": 14849933312, "data/tokens_consumed_B": 14.849933312, "train/loss_slope": -9.938310053196119e-06} {"step": 7090, "timestamp": 1778333380.6974387, "train/loss": 2.401604986190796, "train/z_loss": 0.0013496183906681835, "train/perplexity": 11.04088263185467, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023312.6633462086, "perf/iters_per_sec": 0.9647906605464023, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364942789077758, "data/tokens_consumed": 14870904832, "data/tokens_consumed_B": 14.870904832, "train/loss_slope": -6.089245103480623e-06} {"step": 7100, "timestamp": 1778333391.056352, "grad/layer_0/attn": 0.003217951161786914, "grad/layer_0/mlp": 0.0034705738071352243, "grad/layer_0/attn_mlp_ratio": 0.9272100948984436, "grad/layer_4/attn": 0.0022386140190064907, "grad/layer_4/mlp": 0.002550791949033737, "grad/layer_4/attn_mlp_ratio": 0.8776152567413317, "grad/layer_8/attn": 0.003726667957380414, "grad/layer_8/mlp": 0.0034724611323326826, "grad/layer_8/attn_mlp_ratio": 1.0732064976509368, "grad/layer_12/attn": 0.006471563130617142, "grad/layer_12/mlp": 0.00721561536192894, "grad/layer_12/attn_mlp_ratio": 0.8968830399516757, "grad/layer_16/attn": 0.004142104182392359, "grad/layer_16/mlp": 0.004384842701256275, "grad/layer_16/attn_mlp_ratio": 0.9446414318902446, "grad/layer_20/attn": 0.002661575097590685, "grad/layer_20/mlp": 0.00571758858859539, "grad/layer_20/attn_mlp_ratio": 0.4655065697362257, "grad/layer_24/attn": 0.005016803275793791, "grad/layer_24/mlp": 0.007682579569518566, "grad/layer_24/attn_mlp_ratio": 0.653010250671201, "grad/layer_27/attn": 0.006444520317018032, "grad/layer_27/mlp": 0.0069170002825558186, "grad/layer_27/attn_mlp_ratio": 0.9316929247641293} {"step": 7100, "timestamp": 1778333391.071987, "train/loss": 2.3468084812164305, "train/z_loss": 0.001352973806206137, "train/perplexity": 10.452158177145778, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022819.8671300123, "perf/iters_per_sec": 0.9645556769990979, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367467880249024, "data/tokens_consumed": 14891876352, "data/tokens_consumed_B": 14.891876352, "train/loss_slope": -7.392047998821431e-06} {"step": 7110, "timestamp": 1778333401.442605, "train/loss": 2.3697184324264526, "train/z_loss": 0.0013689094805158675, "train/perplexity": 10.694380669328762, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023131.0764638628, "perf/iters_per_sec": 0.9647040731734575, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365873098373413, "data/tokens_consumed": 14912847872, "data/tokens_consumed_B": 14.912847872, "train/loss_slope": -6.668135313668626e-06} {"step": 7120, "timestamp": 1778333411.8143148, "train/loss": 2.3498887062072753, "train/z_loss": 0.0013418312184512616, "train/perplexity": 10.48440281084037, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023208.3236909553, "perf/iters_per_sec": 0.9647409075217034, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365477323532104, "data/tokens_consumed": 14933819392, "data/tokens_consumed_B": 14.933819392, "train/loss_slope": -4.211332116297201e-06} {"step": 7125, "timestamp": 1778333417.5868208, "eos/sharpness": 40.56775569915771, "eos/L0_probe": 2.3322227001190186, "eos/L_plus": 2.528552293777466, "eos/L_minus": 2.5415706634521484, "eos/grad_norm": 0.12975847721099854, "eos/embed_grad_frac": 0.13770641386508942, "eos/time_s": 0.5966053009033203} {"step": 7125, "timestamp": 1778333418.9666407, "geo/rankme_last": 429.4722900390625, "geo/layer_0/stable_rank_q_proj": 20.73653221130371, "geo/layer_0/stable_rank_k_proj": 17.21713638305664, "geo/layer_0/stable_rank_o_proj": 44.94275665283203, "geo/layer_0/stable_rank_gate_proj": 128.4151611328125, "geo/layer_0/stable_rank_down_proj": 56.70622253417969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06193532049655914, "geo/layer_0/attn_entropy_mean": 6.246031761169434, "geo/layer_0/attn_entropy_std": 0.44368794560432434, "geo/layer_7/stable_rank_q_proj": 42.06585693359375, "geo/layer_7/stable_rank_k_proj": 38.867549896240234, "geo/layer_7/stable_rank_o_proj": 89.74920654296875, "geo/layer_7/stable_rank_gate_proj": 79.00336456298828, "geo/layer_7/stable_rank_down_proj": 144.38510131835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4215514063835144, "geo/layer_7/attn_entropy_mean": 4.739243984222412, "geo/layer_7/attn_entropy_std": 0.7827761769294739, "geo/layer_14/stable_rank_q_proj": 51.83592224121094, "geo/layer_14/stable_rank_k_proj": 42.191314697265625, "geo/layer_14/stable_rank_o_proj": 42.54564666748047, "geo/layer_14/stable_rank_gate_proj": 72.02264404296875, "geo/layer_14/stable_rank_down_proj": 126.93025207519531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3705260455608368, "geo/layer_14/attn_entropy_mean": 5.549330711364746, "geo/layer_14/attn_entropy_std": 0.42969444394111633, "geo/layer_21/stable_rank_q_proj": 39.42428207397461, "geo/layer_21/stable_rank_k_proj": 28.928903579711914, "geo/layer_21/stable_rank_o_proj": 65.84558868408203, "geo/layer_21/stable_rank_gate_proj": 61.839271545410156, "geo/layer_21/stable_rank_down_proj": 49.75859069824219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13905175030231476, "geo/layer_21/attn_entropy_mean": 5.853086471557617, "geo/layer_21/attn_entropy_std": 0.3146945536136627, "geo/layer_27/stable_rank_q_proj": 44.04658508300781, "geo/layer_27/stable_rank_k_proj": 30.424379348754883, "geo/layer_27/stable_rank_o_proj": 108.24044799804688, "geo/layer_27/stable_rank_gate_proj": 72.18050384521484, "geo/layer_27/stable_rank_down_proj": 128.0845489501953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09851565957069397, "geo/layer_27/attn_entropy_mean": 4.323922157287598, "geo/layer_27/attn_entropy_std": 0.6735677123069763, "attnres/final_alpha/block_0": 0.2539650797843933, "attnres/block_norm/0": 1.7777223587036133, "attnres/final_alpha/block_1": 0.004030355252325535, "attnres/block_norm/1": 50014.18359375, "attnres/final_alpha/block_2": 0.008687219582498074, "attnres/block_norm/2": 29790.07421875, "attnres/final_alpha/block_3": 0.0105595663189888, "attnres/block_norm/3": 70156.4921875, "attnres/final_alpha/block_4": 0.01209214050322771, "attnres/block_norm/4": 17012.759765625, "attnres/final_alpha/block_5": 0.6080265045166016, "attnres/block_norm/5": 7158.61767578125, "attnres/final_alpha/block_6": 0.10263917595148087, "attnres/block_norm/6": 46806.203125, "geo/tier1_time_s": 1.3593347072601318, "geo/step": 7125.0, "geo/rankme_slope": 0.0006224771353854041} {"step": 7130, "timestamp": 1778333424.1534102, "train/loss": 2.3739808559417725, "train/z_loss": 0.0013375417096540333, "train/perplexity": 10.740061936266308, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700460.6819811417, "perf/iters_per_sec": 0.8108428392320355, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2332846164703368, "data/tokens_consumed": 14954790912, "data/tokens_consumed_B": 14.954790912, "train/loss_slope": -2.964568145275763e-06} {"step": 7140, "timestamp": 1778333434.518522, "train/loss": 2.341366934776306, "train/z_loss": 0.0013570746988989413, "train/perplexity": 10.395436739156896, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024463.160374982, "perf/iters_per_sec": 0.9653392602801237, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359052419662476, "data/tokens_consumed": 14975762432, "data/tokens_consumed_B": 14.975762432, "train/loss_slope": -4.816286019509184e-06} {"step": 7150, "timestamp": 1778333444.889524, "grad/layer_0/attn": 0.002588961971923709, "grad/layer_0/mlp": 0.0029845447279512882, "grad/layer_0/attn_mlp_ratio": 0.8674562190110834, "grad/layer_4/attn": 0.002092689275741577, "grad/layer_4/mlp": 0.0025316302198916674, "grad/layer_4/attn_mlp_ratio": 0.826617243164921, "grad/layer_8/attn": 0.008064583875238895, "grad/layer_8/mlp": 0.003170379437506199, "grad/layer_8/attn_mlp_ratio": 2.5437282129263448, "grad/layer_12/attn": 0.005114132072776556, "grad/layer_12/mlp": 0.006940808147192001, "grad/layer_12/attn_mlp_ratio": 0.7368208270046285, "grad/layer_16/attn": 0.0029451102018356323, "grad/layer_16/mlp": 0.004383290186524391, "grad/layer_16/attn_mlp_ratio": 0.6718948573608791, "grad/layer_20/attn": 0.003560757264494896, "grad/layer_20/mlp": 0.005106204655021429, "grad/layer_20/attn_mlp_ratio": 0.697339302932037, "grad/layer_24/attn": 0.005944518372416496, "grad/layer_24/mlp": 0.008316456340253353, "grad/layer_24/attn_mlp_ratio": 0.7147898164467994, "grad/layer_27/attn": 0.004092519637197256, "grad/layer_27/mlp": 0.0077133639715611935, "grad/layer_27/attn_mlp_ratio": 0.5305751938102056} {"step": 7150, "timestamp": 1778333444.9052286, "train/loss": 2.3619476079940798, "train/z_loss": 0.0013488532276824117, "train/perplexity": 10.61159857387032, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020476.4305183627, "perf/iters_per_sec": 0.9634382393447698, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379492521286011, "data/tokens_consumed": 14996733952, "data/tokens_consumed_B": 14.996733952, "train/loss_slope": -6.3565882125702805e-06} {"step": 7160, "timestamp": 1778333455.2763896, "train/loss": 2.3615793704986574, "train/z_loss": 0.0013556950492784382, "train/perplexity": 10.607691704761045, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023513.740667372, "perf/iters_per_sec": 0.9648865416848049, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036391282081604, "data/tokens_consumed": 15017705472, "data/tokens_consumed_B": 15.017705472, "train/loss_slope": -4.341913993530942e-06} {"step": 7170, "timestamp": 1778333465.642986, "train/loss": 2.346516728401184, "train/z_loss": 0.0013576215947978198, "train/perplexity": 10.449109175371248, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023912.941695288, "perf/iters_per_sec": 0.9650768955685082, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361868619918824, "data/tokens_consumed": 15038676992, "data/tokens_consumed_B": 15.038676992, "train/loss_slope": -3.4864297043335895e-06} {"step": 7180, "timestamp": 1778333476.0085838, "train/loss": 2.3342466354370117, "train/z_loss": 0.0013503820984624327, "train/perplexity": 10.321681010852629, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024589.0651775214, "perf/iters_per_sec": 0.9653992963683707, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358408212661743, "data/tokens_consumed": 15059648512, "data/tokens_consumed_B": 15.059648512, "train/loss_slope": -4.531304070157628e-06} {"step": 7190, "timestamp": 1778333486.3766274, "train/loss": 2.3952560901641844, "train/z_loss": 0.0013434990076348185, "train/perplexity": 10.971007266412006, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024106.2201345952, "perf/iters_per_sec": 0.9651690579102493, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360879182815552, "data/tokens_consumed": 15080620032, "data/tokens_consumed_B": 15.080620032, "train/loss_slope": -3.3543748108788648e-06} {"step": 7200, "timestamp": 1778333496.7501566, "grad/layer_0/attn": 0.002936192322522402, "grad/layer_0/mlp": 0.003358857473358512, "grad/layer_0/attn_mlp_ratio": 0.8741639853417517, "grad/layer_4/attn": 0.0031676802318543196, "grad/layer_4/mlp": 0.002660124097019434, "grad/layer_4/attn_mlp_ratio": 1.1908016307673086, "grad/layer_8/attn": 0.005572878755629063, "grad/layer_8/mlp": 0.00343491043895483, "grad/layer_8/attn_mlp_ratio": 1.6224232603521518, "grad/layer_12/attn": 0.006410089787095785, "grad/layer_12/mlp": 0.007182581815868616, "grad/layer_12/attn_mlp_ratio": 0.892449242094107, "grad/layer_16/attn": 0.006080478895455599, "grad/layer_16/mlp": 0.004838708788156509, "grad/layer_16/attn_mlp_ratio": 1.256632510035583, "grad/layer_20/attn": 0.004658571444451809, "grad/layer_20/mlp": 0.006705714855343103, "grad/layer_20/attn_mlp_ratio": 0.6947165925595848, "grad/layer_24/attn": 0.013333218172192574, "grad/layer_24/mlp": 0.013918915763497353, "grad/layer_24/attn_mlp_ratio": 0.957920739154636, "grad/layer_27/attn": 0.004056941252201796, "grad/layer_27/mlp": 0.012793135829269886, "grad/layer_27/attn_mlp_ratio": 0.3171185919255161} {"step": 7200, "timestamp": 1778333497.339567, "eos/sharpness": 57.21223354339598, "eos/L0_probe": 2.333928108215332, "eos/L_plus": 2.603365659713745, "eos/L_minus": 2.636612892150879, "eos/grad_norm": 0.20899292826652527, "eos/embed_grad_frac": 0.054768264293670654, "eos/time_s": 0.5866279602050781} {"step": 7200, "timestamp": 1778333497.3600879, "train/loss": 2.3107712268829346, "train/z_loss": 0.0013460443238727748, "train/perplexity": 10.08219731899608, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910401.5971379203, "perf/iters_per_sec": 0.9109504686059572, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0977545261383057, "data/tokens_consumed": 15101591552, "data/tokens_consumed_B": 15.101591552, "train/loss_slope": -3.6419646431653148e-06} {"step": 7200, "timestamp": 1778333498.7238977, "geo/rankme_last": 428.33477783203125, "geo/layer_0/stable_rank_q_proj": 20.774988174438477, "geo/layer_0/stable_rank_k_proj": 17.225305557250977, "geo/layer_0/stable_rank_o_proj": 44.94660568237305, "geo/layer_0/stable_rank_gate_proj": 128.3165740966797, "geo/layer_0/stable_rank_down_proj": 56.75587844848633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06642758846282959, "geo/layer_0/attn_entropy_mean": 6.250795364379883, "geo/layer_0/attn_entropy_std": 0.43930789828300476, "geo/layer_7/stable_rank_q_proj": 42.030399322509766, "geo/layer_7/stable_rank_k_proj": 38.94492721557617, "geo/layer_7/stable_rank_o_proj": 89.87028503417969, "geo/layer_7/stable_rank_gate_proj": 78.91231536865234, "geo/layer_7/stable_rank_down_proj": 144.2820587158203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4207165241241455, "geo/layer_7/attn_entropy_mean": 4.710966110229492, "geo/layer_7/attn_entropy_std": 0.7727439403533936, "geo/layer_14/stable_rank_q_proj": 51.816898345947266, "geo/layer_14/stable_rank_k_proj": 42.18831253051758, "geo/layer_14/stable_rank_o_proj": 42.58545684814453, "geo/layer_14/stable_rank_gate_proj": 71.99727630615234, "geo/layer_14/stable_rank_down_proj": 127.0125503540039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3746397793292999, "geo/layer_14/attn_entropy_mean": 5.520648956298828, "geo/layer_14/attn_entropy_std": 0.44811949133872986, "geo/layer_21/stable_rank_q_proj": 39.39763259887695, "geo/layer_21/stable_rank_k_proj": 28.844135284423828, "geo/layer_21/stable_rank_o_proj": 65.84339904785156, "geo/layer_21/stable_rank_gate_proj": 61.86199188232422, "geo/layer_21/stable_rank_down_proj": 49.74033737182617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13496458530426025, "geo/layer_21/attn_entropy_mean": 5.856022834777832, "geo/layer_21/attn_entropy_std": 0.3063998818397522, "geo/layer_27/stable_rank_q_proj": 44.12989807128906, "geo/layer_27/stable_rank_k_proj": 30.448759078979492, "geo/layer_27/stable_rank_o_proj": 108.17871856689453, "geo/layer_27/stable_rank_gate_proj": 72.07989501953125, "geo/layer_27/stable_rank_down_proj": 127.95602416992188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10278867185115814, "geo/layer_27/attn_entropy_mean": 4.304834365844727, "geo/layer_27/attn_entropy_std": 0.6663022041320801, "attnres/final_alpha/block_0": 0.2541041672229767, "attnres/block_norm/0": 1.7779911756515503, "attnres/final_alpha/block_1": 0.00401937123388052, "attnres/block_norm/1": 50168.6796875, "attnres/final_alpha/block_2": 0.008601892739534378, "attnres/block_norm/2": 29898.6796875, "attnres/final_alpha/block_3": 0.010313020087778568, "attnres/block_norm/3": 70733.2421875, "attnres/final_alpha/block_4": 0.012088924646377563, "attnres/block_norm/4": 16946.984375, "attnres/final_alpha/block_5": 0.6093965768814087, "attnres/block_norm/5": 7101.2294921875, "attnres/final_alpha/block_6": 0.10147607326507568, "attnres/block_norm/6": 47145.5, "geo/tier1_time_s": 1.3598432540893555, "geo/step": 7200.0, "geo/rankme_slope": 0.0006351076758828531} {"step": 7210, "timestamp": 1778333509.4690526, "train/loss": 2.3522374391555787, "train/z_loss": 0.001361845142673701, "train/perplexity": 10.509056814667279, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1732451.5352415047, "perf/iters_per_sec": 0.8260972667892001, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2105112075805664, "data/tokens_consumed": 15122563072, "data/tokens_consumed_B": 15.122563072, "train/loss_slope": -5.2874188576236234e-06} {"step": 7220, "timestamp": 1778333519.8500807, "train/loss": 2.352533197402954, "train/z_loss": 0.0013689951621927321, "train/perplexity": 10.512165414566724, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021395.0365091658, "perf/iters_per_sec": 0.9638762648149327, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374775648117065, "data/tokens_consumed": 15143534592, "data/tokens_consumed_B": 15.143534592, "train/loss_slope": -9.22394299557209e-06} {"step": 7230, "timestamp": 1778333530.220708, "train/loss": 2.346331977844238, "train/z_loss": 0.0013535892590880393, "train/perplexity": 10.447178874949039, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023182.077585912, "perf/iters_per_sec": 0.9647283924035607, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365611791610718, "data/tokens_consumed": 15164506112, "data/tokens_consumed_B": 15.164506112, "train/loss_slope": -8.74874236309736e-06} {"step": 7240, "timestamp": 1778333540.593242, "train/loss": 2.3387231111526487, "train/z_loss": 0.0013541301479563117, "train/perplexity": 10.367989336960507, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023208.9751986782, "perf/iters_per_sec": 0.9647412181847945, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365473985671998, "data/tokens_consumed": 15185477632, "data/tokens_consumed_B": 15.185477632, "train/loss_slope": -8.090867546990271e-06} {"step": 7250, "timestamp": 1778333550.961907, "grad/layer_0/attn": 0.0032087478321045637, "grad/layer_0/mlp": 0.0033619985915720463, "grad/layer_0/attn_mlp_ratio": 0.954416740300448, "grad/layer_4/attn": 0.0022891503758728504, "grad/layer_4/mlp": 0.0027054438833147287, "grad/layer_4/attn_mlp_ratio": 0.8461274341626425, "grad/layer_8/attn": 0.003895372850820422, "grad/layer_8/mlp": 0.003421311965212226, "grad/layer_8/attn_mlp_ratio": 1.138561106549861, "grad/layer_12/attn": 0.008317982777953148, "grad/layer_12/mlp": 0.007659099064767361, "grad/layer_12/attn_mlp_ratio": 1.0860262543951278, "grad/layer_16/attn": 0.004237940534949303, "grad/layer_16/mlp": 0.004767958074808121, "grad/layer_16/attn_mlp_ratio": 0.8888376071209669, "grad/layer_20/attn": 0.002741043921560049, "grad/layer_20/mlp": 0.005857177544385195, "grad/layer_20/attn_mlp_ratio": 0.46798032909035425, "grad/layer_24/attn": 0.004483294673264027, "grad/layer_24/mlp": 0.00736373383551836, "grad/layer_24/attn_mlp_ratio": 0.608834419130657, "grad/layer_27/attn": 0.004923664964735508, "grad/layer_27/mlp": 0.006728400010615587, "grad/layer_27/attn_mlp_ratio": 0.7317735098671232} {"step": 7250, "timestamp": 1778333550.9774516, "train/loss": 2.3170916080474853, "train/z_loss": 0.0013659588410519063, "train/perplexity": 10.146122451828635, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020558.534315755, "perf/iters_per_sec": 0.963477389486196, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037907075881958, "data/tokens_consumed": 15206449152, "data/tokens_consumed_B": 15.206449152, "train/loss_slope": -7.921612087470631e-06} {"step": 7260, "timestamp": 1778333561.3488402, "train/loss": 2.3844730615615846, "train/z_loss": 0.0013575360062532127, "train/perplexity": 10.853342114718469, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022980.182181284, "perf/iters_per_sec": 0.9646321211725636, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366646289825439, "data/tokens_consumed": 15227420672, "data/tokens_consumed_B": 15.227420672, "train/loss_slope": -5.188211663649859e-06} {"step": 7270, "timestamp": 1778333571.7222693, "train/loss": 2.339191436767578, "train/z_loss": 0.0013450108701363206, "train/perplexity": 10.37284606911959, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022956.7799965085, "perf/iters_per_sec": 0.9646209621412795, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366766214370728, "data/tokens_consumed": 15248392192, "data/tokens_consumed_B": 15.248392192, "train/loss_slope": -6.801520887524223e-06} {"step": 7275, "timestamp": 1778333577.496666, "eos/sharpness": 6.047105789184569, "eos/L0_probe": 2.3361973762512207, "eos/L_plus": 2.38181471824646, "eos/L_minus": 2.351051092147827, "eos/grad_norm": 0.09553800523281097, "eos/embed_grad_frac": 0.2623000144958496, "eos/time_s": 0.5949850082397461} {"step": 7275, "timestamp": 1778333578.8737695, "geo/rankme_last": 428.26983642578125, "geo/layer_0/stable_rank_q_proj": 20.74689483642578, "geo/layer_0/stable_rank_k_proj": 17.226728439331055, "geo/layer_0/stable_rank_o_proj": 44.861061096191406, "geo/layer_0/stable_rank_gate_proj": 128.31813049316406, "geo/layer_0/stable_rank_down_proj": 56.77513885498047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06450831145048141, "geo/layer_0/attn_entropy_mean": 6.249222278594971, "geo/layer_0/attn_entropy_std": 0.4389995038509369, "geo/layer_7/stable_rank_q_proj": 42.049278259277344, "geo/layer_7/stable_rank_k_proj": 38.925174713134766, "geo/layer_7/stable_rank_o_proj": 89.7635269165039, "geo/layer_7/stable_rank_gate_proj": 78.92483520507812, "geo/layer_7/stable_rank_down_proj": 144.4584197998047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41816043853759766, "geo/layer_7/attn_entropy_mean": 4.726376533508301, "geo/layer_7/attn_entropy_std": 0.7603698968887329, "geo/layer_14/stable_rank_q_proj": 51.77926254272461, "geo/layer_14/stable_rank_k_proj": 42.14752197265625, "geo/layer_14/stable_rank_o_proj": 42.52076721191406, "geo/layer_14/stable_rank_gate_proj": 71.96833038330078, "geo/layer_14/stable_rank_down_proj": 127.13685607910156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37397369742393494, "geo/layer_14/attn_entropy_mean": 5.508685111999512, "geo/layer_14/attn_entropy_std": 0.454402357339859, "geo/layer_21/stable_rank_q_proj": 39.40371322631836, "geo/layer_21/stable_rank_k_proj": 28.72529411315918, "geo/layer_21/stable_rank_o_proj": 65.7772445678711, "geo/layer_21/stable_rank_gate_proj": 61.81229019165039, "geo/layer_21/stable_rank_down_proj": 49.68777847290039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1331907957792282, "geo/layer_21/attn_entropy_mean": 5.8489885330200195, "geo/layer_21/attn_entropy_std": 0.3204210102558136, "geo/layer_27/stable_rank_q_proj": 44.13718795776367, "geo/layer_27/stable_rank_k_proj": 30.468708038330078, "geo/layer_27/stable_rank_o_proj": 108.24661254882812, "geo/layer_27/stable_rank_gate_proj": 72.01298522949219, "geo/layer_27/stable_rank_down_proj": 128.0337371826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10031512379646301, "geo/layer_27/attn_entropy_mean": 4.325943946838379, "geo/layer_27/attn_entropy_std": 0.6508681774139404, "attnres/final_alpha/block_0": 0.2545994222164154, "attnres/block_norm/0": 1.7780041694641113, "attnres/final_alpha/block_1": 0.003990158438682556, "attnres/block_norm/1": 50000.265625, "attnres/final_alpha/block_2": 0.008678684942424297, "attnres/block_norm/2": 29757.9921875, "attnres/final_alpha/block_3": 0.010619761422276497, "attnres/block_norm/3": 69654.0703125, "attnres/final_alpha/block_4": 0.012210838496685028, "attnres/block_norm/4": 17056.724609375, "attnres/final_alpha/block_5": 0.6079572439193726, "attnres/block_norm/5": 7209.3876953125, "attnres/final_alpha/block_6": 0.10194391757249832, "attnres/block_norm/6": 47233.76171875, "geo/tier1_time_s": 1.3573095798492432, "geo/step": 7275.0, "geo/rankme_slope": 0.0006159832487682573} {"step": 7280, "timestamp": 1778333584.061921, "train/loss": 2.3462589025497436, "train/z_loss": 0.0013628410291858018, "train/perplexity": 10.446415472169395, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700387.5420224317, "perf/iters_per_sec": 0.8108079633819731, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.233337664604187, "data/tokens_consumed": 15269363712, "data/tokens_consumed_B": 15.269363712, "train/loss_slope": -7.855233479433146e-06} {"step": 7290, "timestamp": 1778333594.43638, "train/loss": 2.250003457069397, "train/z_loss": 0.0013675177819095553, "train/perplexity": 9.487768636176428, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022671.1587936664, "perf/iters_per_sec": 0.9644847673385937, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368230104446412, "data/tokens_consumed": 15290335232, "data/tokens_consumed_B": 15.290335232, "train/loss_slope": -1.567568497152754e-05} {"step": 7300, "timestamp": 1778333604.7967355, "grad/layer_0/attn": 0.0027261199429631233, "grad/layer_0/mlp": 0.002956319833174348, "grad/layer_0/attn_mlp_ratio": 0.9221329235621507, "grad/layer_4/attn": 0.001950768637470901, "grad/layer_4/mlp": 0.002520986134186387, "grad/layer_4/attn_mlp_ratio": 0.7738116975876642, "grad/layer_8/attn": 0.005074523855000734, "grad/layer_8/mlp": 0.0034445489291101694, "grad/layer_8/attn_mlp_ratio": 1.4732041298049505, "grad/layer_12/attn": 0.00463466253131628, "grad/layer_12/mlp": 0.006614457815885544, "grad/layer_12/attn_mlp_ratio": 0.7006866761047027, "grad/layer_16/attn": 0.0037560807541012764, "grad/layer_16/mlp": 0.0039442856796085835, "grad/layer_16/attn_mlp_ratio": 0.9522841305058825, "grad/layer_20/attn": 0.0031182661186903715, "grad/layer_20/mlp": 0.005386437755078077, "grad/layer_20/attn_mlp_ratio": 0.5789106275032244, "grad/layer_24/attn": 0.007388871628791094, "grad/layer_24/mlp": 0.009831158444285393, "grad/layer_24/attn_mlp_ratio": 0.7515768966096128, "grad/layer_27/attn": 0.010051283054053783, "grad/layer_27/mlp": 0.007491925731301308, "grad/layer_27/attn_mlp_ratio": 1.3416153977471943} {"step": 7300, "timestamp": 1778333604.8122659, "train/loss": 2.366376209259033, "train/z_loss": 0.0013523444882594048, "train/perplexity": 10.658697326558569, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022427.5141704082, "perf/iters_per_sec": 0.9643685885288278, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369479179382324, "data/tokens_consumed": 15311306752, "data/tokens_consumed_B": 15.311306752, "train/loss_slope": -1.3294193961403572e-05} {"step": 7310, "timestamp": 1778333615.1885478, "train/loss": 2.3220648050308226, "train/z_loss": 0.0013574069133028388, "train/perplexity": 10.196706796095604, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022129.2120023272, "perf/iters_per_sec": 0.9642263469707142, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037100887298584, "data/tokens_consumed": 15332278272, "data/tokens_consumed_B": 15.332278272, "train/loss_slope": -1.6641418689941816e-05} {"step": 7320, "timestamp": 1778333625.5573747, "train/loss": 2.374894952774048, "train/z_loss": 0.0013615889591164887, "train/perplexity": 10.74988388128119, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023900.5545046923, "perf/iters_per_sec": 0.9650709888957464, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361932039260864, "data/tokens_consumed": 15353249792, "data/tokens_consumed_B": 15.353249792, "train/loss_slope": -1.3757137360960518e-05} {"step": 7330, "timestamp": 1778333635.9270434, "train/loss": 2.367728424072266, "train/z_loss": 0.001352884212974459, "train/perplexity": 10.673119924000634, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023602.7953140517, "perf/iters_per_sec": 0.9649290062494524, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363456726074218, "data/tokens_consumed": 15374221312, "data/tokens_consumed_B": 15.374221312, "train/loss_slope": -1.3874406897553152e-05} {"step": 7340, "timestamp": 1778333646.3099067, "train/loss": 2.3257063150405886, "train/z_loss": 0.0013480284949764609, "train/perplexity": 10.233905895299447, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021037.180489027, "perf/iters_per_sec": 0.9637056257672438, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376612663269043, "data/tokens_consumed": 15395192832, "data/tokens_consumed_B": 15.395192832, "train/loss_slope": -1.7355813688249004e-05} {"step": 7350, "timestamp": 1778333657.1019723, "grad/layer_0/attn": 0.0030122289899736643, "grad/layer_0/mlp": 0.003354001557454467, "grad/layer_0/attn_mlp_ratio": 0.89810003023666, "grad/layer_4/attn": 0.0016065065283328295, "grad/layer_4/mlp": 0.0026213452219963074, "grad/layer_4/attn_mlp_ratio": 0.6128557404674104, "grad/layer_8/attn": 0.0033516648691147566, "grad/layer_8/mlp": 0.0034869832452386618, "grad/layer_8/attn_mlp_ratio": 0.9611932542469188, "grad/layer_12/attn": 0.00548115698620677, "grad/layer_12/mlp": 0.0069335405714809895, "grad/layer_12/attn_mlp_ratio": 0.790527848023138, "grad/layer_16/attn": 0.003888472216203809, "grad/layer_16/mlp": 0.004251645412296057, "grad/layer_16/attn_mlp_ratio": 0.9145805323981209, "grad/layer_20/attn": 0.0027887625619769096, "grad/layer_20/mlp": 0.006170929409563541, "grad/layer_20/attn_mlp_ratio": 0.45191936768277136, "grad/layer_24/attn": 0.010682707652449608, "grad/layer_24/mlp": 0.009440574795007706, "grad/layer_24/attn_mlp_ratio": 1.131573847065051, "grad/layer_27/attn": 0.004353572614490986, "grad/layer_27/mlp": 0.009079559706151485, "grad/layer_27/attn_mlp_ratio": 0.4794915951257241} {"step": 7350, "timestamp": 1778333657.7095995, "eos/sharpness": 59.643244743347154, "eos/L0_probe": 2.3271074295043945, "eos/L_plus": 2.677919864654541, "eos/L_minus": 2.5727274417877197, "eos/grad_norm": 0.15719512104988098, "eos/embed_grad_frac": 0.08805947005748749, "eos/time_s": 0.6047322750091553} {"step": 7350, "timestamp": 1778333657.7309275, "train/loss": 2.3768769979476927, "train/z_loss": 0.0013475871295668186, "train/perplexity": 10.771211766178766, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1837241.6264688326, "perf/iters_per_sec": 0.8760650760978854, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1414677143096923, "data/tokens_consumed": 15416164352, "data/tokens_consumed_B": 15.416164352, "train/loss_slope": -1.6203109172954948e-05} {"step": 7350, "timestamp": 1778333659.093551, "geo/rankme_last": 428.5447692871094, "geo/layer_0/stable_rank_q_proj": 20.73691749572754, "geo/layer_0/stable_rank_k_proj": 17.20241928100586, "geo/layer_0/stable_rank_o_proj": 44.89674377441406, "geo/layer_0/stable_rank_gate_proj": 128.32562255859375, "geo/layer_0/stable_rank_down_proj": 56.826271057128906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06496529281139374, "geo/layer_0/attn_entropy_mean": 6.241605758666992, "geo/layer_0/attn_entropy_std": 0.4389161169528961, "geo/layer_7/stable_rank_q_proj": 42.039302825927734, "geo/layer_7/stable_rank_k_proj": 39.021095275878906, "geo/layer_7/stable_rank_o_proj": 89.62799072265625, "geo/layer_7/stable_rank_gate_proj": 78.91766357421875, "geo/layer_7/stable_rank_down_proj": 144.20986938476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40951624512672424, "geo/layer_7/attn_entropy_mean": 4.708378314971924, "geo/layer_7/attn_entropy_std": 0.7513962388038635, "geo/layer_14/stable_rank_q_proj": 51.692256927490234, "geo/layer_14/stable_rank_k_proj": 42.09395217895508, "geo/layer_14/stable_rank_o_proj": 42.45248031616211, "geo/layer_14/stable_rank_gate_proj": 72.1491928100586, "geo/layer_14/stable_rank_down_proj": 127.1021957397461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36586621403694153, "geo/layer_14/attn_entropy_mean": 5.543890953063965, "geo/layer_14/attn_entropy_std": 0.43599146604537964, "geo/layer_21/stable_rank_q_proj": 39.41938018798828, "geo/layer_21/stable_rank_k_proj": 28.808738708496094, "geo/layer_21/stable_rank_o_proj": 65.7684326171875, "geo/layer_21/stable_rank_gate_proj": 61.74664306640625, "geo/layer_21/stable_rank_down_proj": 49.69782257080078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13437113165855408, "geo/layer_21/attn_entropy_mean": 5.8678388595581055, "geo/layer_21/attn_entropy_std": 0.31456106901168823, "geo/layer_27/stable_rank_q_proj": 44.016082763671875, "geo/layer_27/stable_rank_k_proj": 30.435277938842773, "geo/layer_27/stable_rank_o_proj": 108.34907531738281, "geo/layer_27/stable_rank_gate_proj": 71.94290161132812, "geo/layer_27/stable_rank_down_proj": 128.02279663085938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10239578783512115, "geo/layer_27/attn_entropy_mean": 4.323297500610352, "geo/layer_27/attn_entropy_std": 0.639137327671051, "attnres/final_alpha/block_0": 0.25245001912117004, "attnres/block_norm/0": 1.7780208587646484, "attnres/final_alpha/block_1": 0.003910171799361706, "attnres/block_norm/1": 50011.8203125, "attnres/final_alpha/block_2": 0.008668782189488411, "attnres/block_norm/2": 29786.4453125, "attnres/final_alpha/block_3": 0.010574592277407646, "attnres/block_norm/3": 69716.609375, "attnres/final_alpha/block_4": 0.012171905487775803, "attnres/block_norm/4": 17111.052734375, "attnres/final_alpha/block_5": 0.6120299100875854, "attnres/block_norm/5": 7105.11767578125, "attnres/final_alpha/block_6": 0.10019459575414658, "attnres/block_norm/6": 47232.36328125, "geo/tier1_time_s": 1.3586981296539307, "geo/step": 7350.0, "geo/rankme_slope": 0.0006315941806410064} {"step": 7360, "timestamp": 1778333669.4692008, "train/loss": 2.3890971899032594, "train/z_loss": 0.0013566970941610635, "train/perplexity": 10.903645576790645, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787176.8415148905, "perf/iters_per_sec": 0.8521923263143971, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1734440326690674, "data/tokens_consumed": 15437135872, "data/tokens_consumed_B": 15.437135872, "train/loss_slope": -1.259835526780825e-05} {"step": 7370, "timestamp": 1778333679.844648, "train/loss": 2.356023406982422, "train/z_loss": 0.0013445571181364357, "train/perplexity": 10.548919176854355, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022675.0657708521, "perf/iters_per_sec": 0.9644866303304921, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368210077285767, "data/tokens_consumed": 15458107392, "data/tokens_consumed_B": 15.458107392, "train/loss_slope": -1.3828902025677702e-05} {"step": 7380, "timestamp": 1778333690.2389376, "train/loss": 2.3333499908447264, "train/z_loss": 0.0013396834838204086, "train/perplexity": 10.312430279319932, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018869.9506912983, "perf/iters_per_sec": 0.9626722100693218, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0387751817703248, "data/tokens_consumed": 15479078912, "data/tokens_consumed_B": 15.479078912, "train/loss_slope": -1.388371173161625e-05} {"step": 7390, "timestamp": 1778333700.6099813, "train/loss": 2.2891191244125366, "train/z_loss": 0.001367192156612873, "train/perplexity": 9.866242919705892, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023064.4438357733, "perf/iters_per_sec": 0.9646723002604357, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366214513778687, "data/tokens_consumed": 15500050432, "data/tokens_consumed_B": 15.500050432, "train/loss_slope": -1.693773972104697e-05} {"step": 7400, "timestamp": 1778333710.9709, "grad/layer_0/attn": 0.0029777411837130785, "grad/layer_0/mlp": 0.0033136245328933, "grad/layer_0/attn_mlp_ratio": 0.8986356372879375, "grad/layer_4/attn": 0.002105934778228402, "grad/layer_4/mlp": 0.002663474529981613, "grad/layer_4/attn_mlp_ratio": 0.7906719870813788, "grad/layer_8/attn": 0.006066919304430485, "grad/layer_8/mlp": 0.0033845258876681328, "grad/layer_8/attn_mlp_ratio": 1.7925462314474543, "grad/layer_12/attn": 0.005859910510480404, "grad/layer_12/mlp": 0.007784872781485319, "grad/layer_12/attn_mlp_ratio": 0.7527304041684441, "grad/layer_16/attn": 0.004085531923919916, "grad/layer_16/mlp": 0.004940949846059084, "grad/layer_16/attn_mlp_ratio": 0.8268717490608357, "grad/layer_20/attn": 0.003851425601169467, "grad/layer_20/mlp": 0.0063208602368831635, "grad/layer_20/attn_mlp_ratio": 0.6093198387402778, "grad/layer_24/attn": 0.007444296032190323, "grad/layer_24/mlp": 0.007902662269771099, "grad/layer_24/attn_mlp_ratio": 0.9419984916306057, "grad/layer_27/attn": 0.009906289167702198, "grad/layer_27/mlp": 0.008700772188603878, "grad/layer_27/attn_mlp_ratio": 1.138552859345289} {"step": 7400, "timestamp": 1778333710.9863632, "train/loss": 2.3409178256988525, "train/z_loss": 0.0013581071863882244, "train/perplexity": 10.390769102370728, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021971.21631845, "perf/iters_per_sec": 0.9641510087578058, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371819257736206, "data/tokens_consumed": 15521021952, "data/tokens_consumed_B": 15.521021952, "train/loss_slope": -1.682990775464569e-05} {"step": 7410, "timestamp": 1778333721.363738, "train/loss": 2.393339467048645, "train/z_loss": 0.0013450034079141916, "train/perplexity": 10.950000118108182, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022404.1247705305, "perf/iters_per_sec": 0.964357435593858, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369599103927611, "data/tokens_consumed": 15541993472, "data/tokens_consumed_B": 15.541993472, "train/loss_slope": -1.3212586579435813e-05} {"step": 7420, "timestamp": 1778333731.70718, "train/loss": 2.316844916343689, "train/z_loss": 0.0013550232979469001, "train/perplexity": 10.143619796298939, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028632.6669662227, "perf/iters_per_sec": 0.9673274359542001, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337761163711547, "data/tokens_consumed": 15562964992, "data/tokens_consumed_B": 15.562964992, "train/loss_slope": -1.4922847670547364e-05} {"step": 7425, "timestamp": 1778333737.539022, "eos/sharpness": 43.35558414459228, "eos/L0_probe": 2.3205997943878174, "eos/L_plus": 2.5484073162078857, "eos/L_minus": 2.526348114013672, "eos/grad_norm": 0.16009613871574402, "eos/embed_grad_frac": 0.11229540407657623, "eos/time_s": 0.6702401638031006} {"step": 7425, "timestamp": 1778333738.9177125, "geo/rankme_last": 428.73895263671875, "geo/layer_0/stable_rank_q_proj": 20.708887100219727, "geo/layer_0/stable_rank_k_proj": 17.203855514526367, "geo/layer_0/stable_rank_o_proj": 44.80543518066406, "geo/layer_0/stable_rank_gate_proj": 128.21163940429688, "geo/layer_0/stable_rank_down_proj": 56.86246109008789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062397927045822144, "geo/layer_0/attn_entropy_mean": 6.243983745574951, "geo/layer_0/attn_entropy_std": 0.43246403336524963, "geo/layer_7/stable_rank_q_proj": 41.968719482421875, "geo/layer_7/stable_rank_k_proj": 38.98124313354492, "geo/layer_7/stable_rank_o_proj": 89.63622283935547, "geo/layer_7/stable_rank_gate_proj": 78.87738800048828, "geo/layer_7/stable_rank_down_proj": 144.2725830078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41324418783187866, "geo/layer_7/attn_entropy_mean": 4.745049476623535, "geo/layer_7/attn_entropy_std": 0.770575225353241, "geo/layer_14/stable_rank_q_proj": 51.719581604003906, "geo/layer_14/stable_rank_k_proj": 42.16738510131836, "geo/layer_14/stable_rank_o_proj": 42.43409729003906, "geo/layer_14/stable_rank_gate_proj": 72.14491271972656, "geo/layer_14/stable_rank_down_proj": 127.14268493652344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36324602365493774, "geo/layer_14/attn_entropy_mean": 5.525374412536621, "geo/layer_14/attn_entropy_std": 0.4485413432121277, "geo/layer_21/stable_rank_q_proj": 39.3735237121582, "geo/layer_21/stable_rank_k_proj": 28.846601486206055, "geo/layer_21/stable_rank_o_proj": 65.6785659790039, "geo/layer_21/stable_rank_gate_proj": 61.769405364990234, "geo/layer_21/stable_rank_down_proj": 49.729148864746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13364067673683167, "geo/layer_21/attn_entropy_mean": 5.867514133453369, "geo/layer_21/attn_entropy_std": 0.32921624183654785, "geo/layer_27/stable_rank_q_proj": 44.07468032836914, "geo/layer_27/stable_rank_k_proj": 30.42473602294922, "geo/layer_27/stable_rank_o_proj": 108.26068878173828, "geo/layer_27/stable_rank_gate_proj": 71.90243530273438, "geo/layer_27/stable_rank_down_proj": 128.0228271484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10333635658025742, "geo/layer_27/attn_entropy_mean": 4.3104705810546875, "geo/layer_27/attn_entropy_std": 0.6699069738388062, "attnres/final_alpha/block_0": 0.2544786036014557, "attnres/block_norm/0": 1.7779384851455688, "attnres/final_alpha/block_1": 0.004017631523311138, "attnres/block_norm/1": 50079.9609375, "attnres/final_alpha/block_2": 0.00866696797311306, "attnres/block_norm/2": 29737.814453125, "attnres/final_alpha/block_3": 0.010494949296116829, "attnres/block_norm/3": 69922.8046875, "attnres/final_alpha/block_4": 0.012045128270983696, "attnres/block_norm/4": 17014.046875, "attnres/final_alpha/block_5": 0.6093437075614929, "attnres/block_norm/5": 7144.5009765625, "attnres/final_alpha/block_6": 0.10095298290252686, "attnres/block_norm/6": 47098.2109375, "geo/tier1_time_s": 1.3601341247558594, "geo/step": 7425.0, "geo/rankme_slope": 0.0006262441304646859} {"step": 7430, "timestamp": 1778333744.1008434, "train/loss": 2.356152892112732, "train/z_loss": 0.0013547054375521838, "train/perplexity": 10.55028519346611, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1692775.6371840525, "perf/iters_per_sec": 0.8071783243103278, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2388836145401, "data/tokens_consumed": 15583936512, "data/tokens_consumed_B": 15.583936512, "train/loss_slope": -1.6567902117207557e-05} {"step": 7440, "timestamp": 1778333754.4626386, "train/loss": 2.3355430603027343, "train/z_loss": 0.0013559957500547172, "train/perplexity": 10.335070972435332, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024935.0345296576, "perf/iters_per_sec": 0.9655642674110687, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356638431549072, "data/tokens_consumed": 15604908032, "data/tokens_consumed_B": 15.604908032, "train/loss_slope": -1.7241712436281156e-05} {"step": 7450, "timestamp": 1778333764.7987447, "grad/layer_0/attn": 0.0029119662940502167, "grad/layer_0/mlp": 0.0029850078281015158, "grad/layer_0/attn_mlp_ratio": 0.9755305058443332, "grad/layer_4/attn": 0.0018781922990456223, "grad/layer_4/mlp": 0.002559022745117545, "grad/layer_4/attn_mlp_ratio": 0.7339490159804924, "grad/layer_8/attn": 0.0032183080911636353, "grad/layer_8/mlp": 0.0034311129711568356, "grad/layer_8/attn_mlp_ratio": 0.9379778586190833, "grad/layer_12/attn": 0.005128263495862484, "grad/layer_12/mlp": 0.007166222203522921, "grad/layer_12/attn_mlp_ratio": 0.7156160217554828, "grad/layer_16/attn": 0.003104167291894555, "grad/layer_16/mlp": 0.004424586892127991, "grad/layer_16/attn_mlp_ratio": 0.7015722139529268, "grad/layer_20/attn": 0.0025672127958387136, "grad/layer_20/mlp": 0.005934764165431261, "grad/layer_20/attn_mlp_ratio": 0.43257199124018825, "grad/layer_24/attn": 0.011378300376236439, "grad/layer_24/mlp": 0.010139617137610912, "grad/layer_24/attn_mlp_ratio": 1.1221627118261304, "grad/layer_27/attn": 0.007215735502541065, "grad/layer_27/mlp": 0.01057172380387783, "grad/layer_27/attn_mlp_ratio": 0.6825505062513267} {"step": 7450, "timestamp": 1778333764.8147051, "train/loss": 2.3388490676879883, "train/z_loss": 0.001355147990398109, "train/perplexity": 10.369295335223608, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026610.4833193703, "perf/iters_per_sec": 0.9663631836506702, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348076343536377, "data/tokens_consumed": 15625879552, "data/tokens_consumed_B": 15.625879552, "train/loss_slope": -1.873112165495114e-05} {"step": 7460, "timestamp": 1778333775.1678326, "train/loss": 2.3500828981399535, "train/z_loss": 0.001350596291013062, "train/perplexity": 10.486438994984054, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026632.7560727433, "perf/iters_per_sec": 0.966373804127094, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347962617874145, "data/tokens_consumed": 15646851072, "data/tokens_consumed_B": 15.646851072, "train/loss_slope": -1.7284263979376467e-05} {"step": 7470, "timestamp": 1778333785.5360823, "train/loss": 2.3623869895935057, "train/z_loss": 0.0013576553435996174, "train/perplexity": 10.616262139491605, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023946.7045447798, "perf/iters_per_sec": 0.9650929949497127, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361695766448975, "data/tokens_consumed": 15667822592, "data/tokens_consumed_B": 15.667822592, "train/loss_slope": -1.705408330940823e-05} {"step": 7480, "timestamp": 1778333795.882188, "train/loss": 2.328175759315491, "train/z_loss": 0.0013579324469901622, "train/perplexity": 10.259209185296179, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027953.1859031261, "perf/iters_per_sec": 0.9670034341350203, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341224908828734, "data/tokens_consumed": 15688794112, "data/tokens_consumed_B": 15.688794112, "train/loss_slope": -1.595400965610785e-05} {"step": 7490, "timestamp": 1778333806.6841424, "train/loss": 2.3852120876312255, "train/z_loss": 0.0013492179452441633, "train/perplexity": 10.86136598204235, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1942376.7272269889, "perf/iters_per_sec": 0.9261973987708039, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0796834468841552, "data/tokens_consumed": 15709765632, "data/tokens_consumed_B": 15.709765632, "train/loss_slope": -9.817264017814167e-06} {"step": 7500, "timestamp": 1778333817.0279472, "grad/layer_0/attn": 0.003795757656916976, "grad/layer_0/mlp": 0.003693881444633007, "grad/layer_0/attn_mlp_ratio": 1.0275796911874417, "grad/layer_4/attn": 0.002789299236610532, "grad/layer_4/mlp": 0.0030025371816009283, "grad/layer_4/attn_mlp_ratio": 0.9289807169765767, "grad/layer_8/attn": 0.00786328874528408, "grad/layer_8/mlp": 0.0039364732801914215, "grad/layer_8/attn_mlp_ratio": 1.997546531078462, "grad/layer_12/attn": 0.010104943998157978, "grad/layer_12/mlp": 0.008001027628779411, "grad/layer_12/attn_mlp_ratio": 1.2629557527729662, "grad/layer_16/attn": 0.004448090214282274, "grad/layer_16/mlp": 0.005956599488854408, "grad/layer_16/attn_mlp_ratio": 0.7467499112421866, "grad/layer_20/attn": 0.0033501763828098774, "grad/layer_20/mlp": 0.007300877943634987, "grad/layer_20/attn_mlp_ratio": 0.45887307838686753, "grad/layer_24/attn": 0.018253112211823463, "grad/layer_24/mlp": 0.014160355553030968, "grad/layer_24/attn_mlp_ratio": 1.2890292206690768, "grad/layer_27/attn": 0.009939173236489296, "grad/layer_27/mlp": 0.015717169269919395, "grad/layer_27/attn_mlp_ratio": 0.6323767977910559} {"step": 7500, "timestamp": 1778333817.6313157, "eos/sharpness": 68.19641590118407, "eos/L0_probe": 2.3249926567077637, "eos/L_plus": 2.647388219833374, "eos/L_minus": 2.684561252593994, "eos/grad_norm": 0.2678891718387604, "eos/embed_grad_frac": 0.042270246893167496, "eos/time_s": 0.6004829406738281} {"step": 7500, "timestamp": 1778333817.6510131, "train/loss": 2.3409596920013427, "train/z_loss": 0.0013526612892746925, "train/perplexity": 10.391204134559604, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913481.2009547553, "perf/iters_per_sec": 0.9124189381383683, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0959877729415894, "data/tokens_consumed": 15730737152, "data/tokens_consumed_B": 15.730737152, "train/loss_slope": -1.1978662074810437e-05} {"step": 7500, "timestamp": 1778333819.0174348, "geo/rankme_last": 427.7771301269531, "geo/layer_0/stable_rank_q_proj": 20.756811141967773, "geo/layer_0/stable_rank_k_proj": 17.197006225585938, "geo/layer_0/stable_rank_o_proj": 44.819480895996094, "geo/layer_0/stable_rank_gate_proj": 128.21165466308594, "geo/layer_0/stable_rank_down_proj": 56.89487838745117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06453551352024078, "geo/layer_0/attn_entropy_mean": 6.246415138244629, "geo/layer_0/attn_entropy_std": 0.436565101146698, "geo/layer_7/stable_rank_q_proj": 42.029075622558594, "geo/layer_7/stable_rank_k_proj": 38.98779296875, "geo/layer_7/stable_rank_o_proj": 89.68758392333984, "geo/layer_7/stable_rank_gate_proj": 78.7740249633789, "geo/layer_7/stable_rank_down_proj": 144.4628448486328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4201701283454895, "geo/layer_7/attn_entropy_mean": 4.725525856018066, "geo/layer_7/attn_entropy_std": 0.7721564769744873, "geo/layer_14/stable_rank_q_proj": 51.731849670410156, "geo/layer_14/stable_rank_k_proj": 42.2508544921875, "geo/layer_14/stable_rank_o_proj": 42.42890930175781, "geo/layer_14/stable_rank_gate_proj": 72.1118392944336, "geo/layer_14/stable_rank_down_proj": 127.08112335205078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36574721336364746, "geo/layer_14/attn_entropy_mean": 5.498076438903809, "geo/layer_14/attn_entropy_std": 0.4600224792957306, "geo/layer_21/stable_rank_q_proj": 39.301273345947266, "geo/layer_21/stable_rank_k_proj": 28.867977142333984, "geo/layer_21/stable_rank_o_proj": 65.6355972290039, "geo/layer_21/stable_rank_gate_proj": 61.65974426269531, "geo/layer_21/stable_rank_down_proj": 49.67995071411133, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13443545997142792, "geo/layer_21/attn_entropy_mean": 5.853013515472412, "geo/layer_21/attn_entropy_std": 0.3169987201690674, "geo/layer_27/stable_rank_q_proj": 44.132450103759766, "geo/layer_27/stable_rank_k_proj": 30.381837844848633, "geo/layer_27/stable_rank_o_proj": 108.12872314453125, "geo/layer_27/stable_rank_gate_proj": 71.90531921386719, "geo/layer_27/stable_rank_down_proj": 128.30775451660156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09467651695013046, "geo/layer_27/attn_entropy_mean": 4.349143028259277, "geo/layer_27/attn_entropy_std": 0.6831191778182983, "attnres/final_alpha/block_0": 0.2559467554092407, "attnres/block_norm/0": 1.7779250144958496, "attnres/final_alpha/block_1": 0.003984266892075539, "attnres/block_norm/1": 49964.203125, "attnres/final_alpha/block_2": 0.008719920180737972, "attnres/block_norm/2": 29770.0859375, "attnres/final_alpha/block_3": 0.010640878230333328, "attnres/block_norm/3": 70204.625, "attnres/final_alpha/block_4": 0.012378772720694542, "attnres/block_norm/4": 17077.056640625, "attnres/final_alpha/block_5": 0.6067367792129517, "attnres/block_norm/5": 7084.14599609375, "attnres/final_alpha/block_6": 0.10159258544445038, "attnres/block_norm/6": 47161.29296875, "geo/tier1_time_s": 1.362287998199463, "geo/step": 7500.0, "geo/rankme_slope": 0.0006038162335246599} {"step": 7500, "timestamp": 1778333825.8548434, "geo/ww_alpha_mean": 7.687745120944724, "geo/ww_alpha_std": 4.612488691441097, "geo/ww_alpha_min": 1.3287349478969488, "geo/ww_alpha_max": 27.01947707346619, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9286950917174503, "geo/ww_alpha_by_type/k_proj": 4.43891682492746, "geo/ww_alpha_by_type/v_proj": 8.91789954180391, "geo/ww_alpha_by_type/o_proj": 8.713144613334988, "geo/ww_alpha_by_type/gate_proj": 7.788322206390411, "geo/ww_alpha_by_type/up_proj": 12.061084177948013, "geo/ww_alpha_by_type/down_proj": 8.062157727534709, "geo/twonn_id/layer_0": 0.7332663536071777, "geo/twonn_id/layer_7": 3.5628256797790527, "geo/twonn_id/layer_14": 4.680995464324951, "geo/twonn_id/layer_21": 7.897984027862549, "geo/twonn_id/layer_27": 6.670839786529541, "geo/tier2_time_s": 6.831074476242065} {"step": 7500, "timestamp": 1778333826.609697, "eoc/jacobian_sigma/layer_0/attn": 1658.761474609375, "eoc/jacobian_sigma/layer_0/mlp": 10736.4404296875, "eoc/jacobian_sigma/layer_0": 10736.4404296875, "eoc/jacobian_sigma/layer_7/attn": 1.1378192901611328, "eoc/jacobian_sigma/layer_7/mlp": 1.9140268564224243, "eoc/jacobian_sigma/layer_7": 1.9140268564224243, "eoc/jacobian_sigma/layer_14/attn": 1.8416167497634888, "eoc/jacobian_sigma/layer_14/mlp": 11.715174674987793, "eoc/jacobian_sigma/layer_14": 11.715174674987793, "eoc/jacobian_sigma/layer_21/attn": 1.098981499671936, "eoc/jacobian_sigma/layer_21/mlp": 4.93204927444458, "eoc/jacobian_sigma/layer_21": 4.93204927444458, "eoc/jacobian_sigma/layer_27/attn": 3.696624994277954, "eoc/jacobian_sigma/layer_27/mlp": 30.370271682739258, "eoc/jacobian_sigma/layer_27": 30.370271682739258, "eoc/layer0_sigma": 10736.4404296875, "eoc/sigma_max": 30.370271682739258, "eoc/sigma_min": 1.9140268564224243, "eoc/sigma_mean": 12.232880622148514, "eoc/time_s": 0.7484962940216064} {"step": 7510, "timestamp": 1778333836.9822435, "train/loss": 2.3573835611343386, "train/z_loss": 0.0013584205182269216, "train/perplexity": 10.563277095348678, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085081.334560431, "perf/iters_per_sec": 0.5174070999910503, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9327141046524048, "data/tokens_consumed": 15751708672, "data/tokens_consumed_B": 15.751708672, "train/loss_slope": -1.2974573361991694e-05} {"step": 7520, "timestamp": 1778333847.3302472, "train/loss": 2.399187183380127, "train/z_loss": 0.001341239793691784, "train/perplexity": 11.01422020003494, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027956.5990079625, "perf/iters_per_sec": 0.9670050616302311, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034120750427246, "data/tokens_consumed": 15772680192, "data/tokens_consumed_B": 15.772680192, "train/loss_slope": -8.649389671556906e-06} {"step": 7530, "timestamp": 1778333857.6813226, "train/loss": 2.360735368728638, "train/z_loss": 0.001355330622754991, "train/perplexity": 10.598742571259896, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027451.537703619, "perf/iters_per_sec": 0.9667642296331497, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343783617019653, "data/tokens_consumed": 15793651712, "data/tokens_consumed_B": 15.793651712, "train/loss_slope": -7.3784813450293626e-06} {"step": 7540, "timestamp": 1778333868.5398853, "train/loss": 2.3454967498779298, "train/z_loss": 0.0013493079924955965, "train/perplexity": 10.438456741975399, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932167.157184595, "perf/iters_per_sec": 0.921329096405313, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0853884935379028, "data/tokens_consumed": 15814623232, "data/tokens_consumed_B": 15.814623232, "train/loss_slope": -9.21841860890497e-06} {"step": 7550, "timestamp": 1778333878.8867326, "grad/layer_0/attn": 0.0025479879695922136, "grad/layer_0/mlp": 0.002922224346548319, "grad/layer_0/attn_mlp_ratio": 0.871934382932787, "grad/layer_4/attn": 0.0019713325891643763, "grad/layer_4/mlp": 0.0024696956388652325, "grad/layer_4/attn_mlp_ratio": 0.798208685443235, "grad/layer_8/attn": 0.003385529387742281, "grad/layer_8/mlp": 0.0034772329963743687, "grad/layer_8/attn_mlp_ratio": 0.9736273910633989, "grad/layer_12/attn": 0.007239403203129768, "grad/layer_12/mlp": 0.00638121273368597, "grad/layer_12/attn_mlp_ratio": 1.1344870311978112, "grad/layer_16/attn": 0.003143147798255086, "grad/layer_16/mlp": 0.004529312252998352, "grad/layer_16/attn_mlp_ratio": 0.6939569526871687, "grad/layer_20/attn": 0.007925071753561497, "grad/layer_20/mlp": 0.006297449581325054, "grad/layer_20/attn_mlp_ratio": 1.2584573366363085, "grad/layer_24/attn": 0.010688656941056252, "grad/layer_24/mlp": 0.009812964126467705, "grad/layer_24/attn_mlp_ratio": 1.0892383477998027, "grad/layer_27/attn": 0.004467268940061331, "grad/layer_27/mlp": 0.010702982544898987, "grad/layer_27/attn_mlp_ratio": 0.41738542313627125} {"step": 7550, "timestamp": 1778333878.9029715, "train/loss": 2.343697023391724, "train/z_loss": 0.001354707032442093, "train/perplexity": 10.419687269925957, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024872.151879275, "perf/iters_per_sec": 0.96553428262676, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356960058212281, "data/tokens_consumed": 15835594752, "data/tokens_consumed_B": 15.835594752, "train/loss_slope": -1.0607903246188573e-05} {"step": 7560, "timestamp": 1778333889.2678514, "train/loss": 2.3922817707061768, "train/z_loss": 0.0013557532685808838, "train/perplexity": 10.938424465874906, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024337.3644073657, "perf/iters_per_sec": 0.9652792760884121, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035969614982605, "data/tokens_consumed": 15856566272, "data/tokens_consumed_B": 15.856566272, "train/loss_slope": -8.266182973010734e-06} {"step": 7570, "timestamp": 1778333899.627565, "train/loss": 2.308277893066406, "train/z_loss": 0.0013551462208852172, "train/perplexity": 10.057090348511887, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025459.7371273045, "perf/iters_per_sec": 0.9658144651066325, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353955507278443, "data/tokens_consumed": 15877537792, "data/tokens_consumed_B": 15.877537792, "train/loss_slope": -1.2009386325290244e-05} {"step": 7575, "timestamp": 1778333905.4027236, "eos/sharpness": 50.34596920013427, "eos/L0_probe": 2.3323020935058594, "eos/L_plus": 2.564425230026245, "eos/L_minus": 2.6036386489868164, "eos/grad_norm": 0.1482856422662735, "eos/embed_grad_frac": 0.11454219371080399, "eos/time_s": 0.6095240116119385} {"step": 7575, "timestamp": 1778333906.784371, "geo/rankme_last": 427.2740173339844, "geo/layer_0/stable_rank_q_proj": 20.755178451538086, "geo/layer_0/stable_rank_k_proj": 17.176006317138672, "geo/layer_0/stable_rank_o_proj": 44.7717399597168, "geo/layer_0/stable_rank_gate_proj": 128.1541290283203, "geo/layer_0/stable_rank_down_proj": 56.88844299316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06309401988983154, "geo/layer_0/attn_entropy_mean": 6.246408939361572, "geo/layer_0/attn_entropy_std": 0.43562740087509155, "geo/layer_7/stable_rank_q_proj": 42.08916473388672, "geo/layer_7/stable_rank_k_proj": 39.06846618652344, "geo/layer_7/stable_rank_o_proj": 89.6268310546875, "geo/layer_7/stable_rank_gate_proj": 78.90200805664062, "geo/layer_7/stable_rank_down_proj": 144.42529296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41048282384872437, "geo/layer_7/attn_entropy_mean": 4.742657661437988, "geo/layer_7/attn_entropy_std": 0.7594342231750488, "geo/layer_14/stable_rank_q_proj": 51.743038177490234, "geo/layer_14/stable_rank_k_proj": 42.33951950073242, "geo/layer_14/stable_rank_o_proj": 42.41011047363281, "geo/layer_14/stable_rank_gate_proj": 72.1413803100586, "geo/layer_14/stable_rank_down_proj": 126.96123504638672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37070250511169434, "geo/layer_14/attn_entropy_mean": 5.508986949920654, "geo/layer_14/attn_entropy_std": 0.4465738534927368, "geo/layer_21/stable_rank_q_proj": 39.325687408447266, "geo/layer_21/stable_rank_k_proj": 28.784528732299805, "geo/layer_21/stable_rank_o_proj": 65.61974334716797, "geo/layer_21/stable_rank_gate_proj": 61.74968719482422, "geo/layer_21/stable_rank_down_proj": 49.69426727294922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.12967894971370697, "geo/layer_21/attn_entropy_mean": 5.862027168273926, "geo/layer_21/attn_entropy_std": 0.3202376365661621, "geo/layer_27/stable_rank_q_proj": 44.169010162353516, "geo/layer_27/stable_rank_k_proj": 30.347057342529297, "geo/layer_27/stable_rank_o_proj": 108.02837371826172, "geo/layer_27/stable_rank_gate_proj": 71.78779602050781, "geo/layer_27/stable_rank_down_proj": 128.25880432128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09805496782064438, "geo/layer_27/attn_entropy_mean": 4.342255115509033, "geo/layer_27/attn_entropy_std": 0.6755309104919434, "attnres/final_alpha/block_0": 0.2553115487098694, "attnres/block_norm/0": 1.7781167030334473, "attnres/final_alpha/block_1": 0.003945502918213606, "attnres/block_norm/1": 49964.41796875, "attnres/final_alpha/block_2": 0.008771972730755806, "attnres/block_norm/2": 29740.453125, "attnres/final_alpha/block_3": 0.010691214352846146, "attnres/block_norm/3": 70040.6796875, "attnres/final_alpha/block_4": 0.012216351926326752, "attnres/block_norm/4": 17019.5859375, "attnres/final_alpha/block_5": 0.6068626046180725, "attnres/block_norm/5": 7147.2109375, "attnres/final_alpha/block_6": 0.10220077633857727, "attnres/block_norm/6": 47053.6796875, "geo/tier1_time_s": 1.3623113632202148, "geo/step": 7575.0, "geo/rankme_slope": 0.000544557490965136} {"step": 7580, "timestamp": 1778333911.960198, "train/loss": 2.3457971811294556, "train/z_loss": 0.0013490754528902471, "train/perplexity": 10.441593251727571, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701492.839560536, "perf/iters_per_sec": 0.8113350103190117, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2325364828109742, "data/tokens_consumed": 15898509312, "data/tokens_consumed_B": 15.898509312, "train/loss_slope": -1.0993280204752341e-05} {"step": 7590, "timestamp": 1778333922.3120527, "train/loss": 2.3554566860198975, "train/z_loss": 0.0013538834871724247, "train/perplexity": 10.542942576917051, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026767.2434761585, "perf/iters_per_sec": 0.9664379327183525, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347275972366332, "data/tokens_consumed": 15919480832, "data/tokens_consumed_B": 15.919480832, "train/loss_slope": -1.0578429716827252e-05} {"step": 7600, "timestamp": 1778333932.6491039, "grad/layer_0/attn": 0.003345369128510356, "grad/layer_0/mlp": 0.003521659877151251, "grad/layer_0/attn_mlp_ratio": 0.9499409795992001, "grad/layer_4/attn": 0.0030120834708213806, "grad/layer_4/mlp": 0.002719793701544404, "grad/layer_4/attn_mlp_ratio": 1.107467584164286, "grad/layer_8/attn": 0.005122407339513302, "grad/layer_8/mlp": 0.0034919718746095896, "grad/layer_8/attn_mlp_ratio": 1.4669096363770213, "grad/layer_12/attn": 0.006386952009052038, "grad/layer_12/mlp": 0.007345038931816816, "grad/layer_12/attn_mlp_ratio": 0.8695599821029416, "grad/layer_16/attn": 0.005128421820700169, "grad/layer_16/mlp": 0.005252140108495951, "grad/layer_16/attn_mlp_ratio": 0.9764441955308709, "grad/layer_20/attn": 0.004710799548774958, "grad/layer_20/mlp": 0.006692482158541679, "grad/layer_20/attn_mlp_ratio": 0.703894215447866, "grad/layer_24/attn": 0.009738665074110031, "grad/layer_24/mlp": 0.009552989155054092, "grad/layer_24/attn_mlp_ratio": 1.0194364103317404, "grad/layer_27/attn": 0.004491484258323908, "grad/layer_27/mlp": 0.007630695588886738, "grad/layer_27/attn_mlp_ratio": 0.5886074404546439} {"step": 7600, "timestamp": 1778333932.6648822, "train/loss": 2.32594153881073, "train/z_loss": 0.0013489480246789754, "train/perplexity": 10.236313436371756, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026920.8517918058, "perf/iters_per_sec": 0.9665111788710622, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346491813659668, "data/tokens_consumed": 15940452352, "data/tokens_consumed_B": 15.940452352, "train/loss_slope": -1.0389627210020173e-05} {"step": 7610, "timestamp": 1778333943.0142405, "train/loss": 2.362829875946045, "train/z_loss": 0.0013467344571836293, "train/perplexity": 10.62096497844289, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027320.043358745, "perf/iters_per_sec": 0.9667015282434201, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344454526901246, "data/tokens_consumed": 15961423872, "data/tokens_consumed_B": 15.961423872, "train/loss_slope": -6.497818794902835e-06} {"step": 7620, "timestamp": 1778333953.3728662, "train/loss": 2.369676113128662, "train/z_loss": 0.001355093577876687, "train/perplexity": 10.6939281002248, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025413.61133737, "perf/iters_per_sec": 0.9657924706160402, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354191303253173, "data/tokens_consumed": 15982395392, "data/tokens_consumed_B": 15.982395392, "train/loss_slope": -5.1404281739819166e-06} {"step": 7630, "timestamp": 1778333963.7306366, "train/loss": 2.3407371759414675, "train/z_loss": 0.0013516151928342878, "train/perplexity": 10.38889218199105, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026085.8844503837, "perf/iters_per_sec": 0.9661130354167861, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035075569152832, "data/tokens_consumed": 16003366912, "data/tokens_consumed_B": 16.003366912, "train/loss_slope": -6.4906228028579305e-06} {"step": 7640, "timestamp": 1778333974.077592, "train/loss": 2.351561951637268, "train/z_loss": 0.0013519052183255554, "train/perplexity": 10.501960474974025, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027821.392738005, "perf/iters_per_sec": 0.9669405902566933, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341897010803223, "data/tokens_consumed": 16024338432, "data/tokens_consumed_B": 16.024338432, "train/loss_slope": -7.367782724393602e-06} {"step": 7650, "timestamp": 1778333984.4246683, "grad/layer_0/attn": 0.0028215853963047266, "grad/layer_0/mlp": 0.003274079179391265, "grad/layer_0/attn_mlp_ratio": 0.861795074439777, "grad/layer_4/attn": 0.0019345401087775826, "grad/layer_4/mlp": 0.0026478045620024204, "grad/layer_4/attn_mlp_ratio": 0.7306204028338606, "grad/layer_8/attn": 0.005459205247461796, "grad/layer_8/mlp": 0.0033911545760929585, "grad/layer_8/attn_mlp_ratio": 1.6098366983813002, "grad/layer_12/attn": 0.0063969227485358715, "grad/layer_12/mlp": 0.007258519064635038, "grad/layer_12/attn_mlp_ratio": 0.8812985959592092, "grad/layer_16/attn": 0.005487713031470776, "grad/layer_16/mlp": 0.004808509722352028, "grad/layer_16/attn_mlp_ratio": 1.1412502488736778, "grad/layer_20/attn": 0.005594651214778423, "grad/layer_20/mlp": 0.006360054016113281, "grad/layer_20/attn_mlp_ratio": 0.8796546558628017, "grad/layer_24/attn": 0.01101685781031847, "grad/layer_24/mlp": 0.010800747200846672, "grad/layer_24/attn_mlp_ratio": 1.0200088478558198, "grad/layer_27/attn": 0.004961238708347082, "grad/layer_27/mlp": 0.01194879598915577, "grad/layer_27/attn_mlp_ratio": 0.4152082495448806} {"step": 7650, "timestamp": 1778333985.0176232, "eos/sharpness": 52.84173488616943, "eos/L0_probe": 2.329335927963257, "eos/L_plus": 2.6157419681549072, "eos/L_minus": 2.571347236633301, "eos/grad_norm": 0.181928351521492, "eos/embed_grad_frac": 0.07486393302679062, "eos/time_s": 0.5902214050292969} {"step": 7650, "timestamp": 1778333985.0381992, "train/loss": 2.36486234664917, "train/z_loss": 0.001352985855191946, "train/perplexity": 10.642573730739777, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914167.8087227766, "perf/iters_per_sec": 0.9127463382352717, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0955946445465088, "data/tokens_consumed": 16045309952, "data/tokens_consumed_B": 16.045309952, "train/loss_slope": -7.254575860418014e-06} {"step": 7650, "timestamp": 1778333986.403828, "geo/rankme_last": 428.14996337890625, "geo/layer_0/stable_rank_q_proj": 20.730823516845703, "geo/layer_0/stable_rank_k_proj": 17.151426315307617, "geo/layer_0/stable_rank_o_proj": 44.75039291381836, "geo/layer_0/stable_rank_gate_proj": 128.1308135986328, "geo/layer_0/stable_rank_down_proj": 56.868473052978516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06636660546064377, "geo/layer_0/attn_entropy_mean": 6.242537021636963, "geo/layer_0/attn_entropy_std": 0.43482139706611633, "geo/layer_7/stable_rank_q_proj": 42.07868576049805, "geo/layer_7/stable_rank_k_proj": 39.03786849975586, "geo/layer_7/stable_rank_o_proj": 89.544677734375, "geo/layer_7/stable_rank_gate_proj": 78.9047622680664, "geo/layer_7/stable_rank_down_proj": 144.4925994873047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3964625895023346, "geo/layer_7/attn_entropy_mean": 4.710068702697754, "geo/layer_7/attn_entropy_std": 0.7507755160331726, "geo/layer_14/stable_rank_q_proj": 51.65509033203125, "geo/layer_14/stable_rank_k_proj": 42.38274383544922, "geo/layer_14/stable_rank_o_proj": 42.4244499206543, "geo/layer_14/stable_rank_gate_proj": 72.01634979248047, "geo/layer_14/stable_rank_down_proj": 126.83283996582031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3705964982509613, "geo/layer_14/attn_entropy_mean": 5.516513824462891, "geo/layer_14/attn_entropy_std": 0.47084322571754456, "geo/layer_21/stable_rank_q_proj": 39.35079574584961, "geo/layer_21/stable_rank_k_proj": 28.781143188476562, "geo/layer_21/stable_rank_o_proj": 65.6337890625, "geo/layer_21/stable_rank_gate_proj": 61.77056884765625, "geo/layer_21/stable_rank_down_proj": 49.6643180847168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13866037130355835, "geo/layer_21/attn_entropy_mean": 5.856365203857422, "geo/layer_21/attn_entropy_std": 0.32256102561950684, "geo/layer_27/stable_rank_q_proj": 44.29523849487305, "geo/layer_27/stable_rank_k_proj": 30.3522891998291, "geo/layer_27/stable_rank_o_proj": 108.12360382080078, "geo/layer_27/stable_rank_gate_proj": 71.76617431640625, "geo/layer_27/stable_rank_down_proj": 128.1981201171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09881947189569473, "geo/layer_27/attn_entropy_mean": 4.321613311767578, "geo/layer_27/attn_entropy_std": 0.6737680435180664, "attnres/final_alpha/block_0": 0.2528349459171295, "attnres/block_norm/0": 1.7781038284301758, "attnres/final_alpha/block_1": 0.003928790800273418, "attnres/block_norm/1": 50161.234375, "attnres/final_alpha/block_2": 0.008550472557544708, "attnres/block_norm/2": 29756.267578125, "attnres/final_alpha/block_3": 0.01037323847413063, "attnres/block_norm/3": 70781.0703125, "attnres/final_alpha/block_4": 0.011961279436945915, "attnres/block_norm/4": 17025.484375, "attnres/final_alpha/block_5": 0.6131443381309509, "attnres/block_norm/5": 7080.443359375, "attnres/final_alpha/block_6": 0.09920694679021835, "attnres/block_norm/6": 47314.7734375, "geo/tier1_time_s": 1.3616943359375, "geo/step": 7650.0, "geo/rankme_slope": 0.0005196898290566227} {"step": 7660, "timestamp": 1778333996.777873, "train/loss": 2.3611968040466307, "train/z_loss": 0.001357296295464039, "train/perplexity": 10.603634333937826, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786957.4738295572, "perf/iters_per_sec": 0.8520877236507212, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1735880851745606, "data/tokens_consumed": 16066281472, "data/tokens_consumed_B": 16.066281472, "train/loss_slope": -6.591712037185721e-06} {"step": 7670, "timestamp": 1778334007.1289647, "train/loss": 2.335299277305603, "train/z_loss": 0.001347241026815027, "train/perplexity": 10.33255176494056, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026940.2821703313, "perf/iters_per_sec": 0.9665204439975411, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034639263153076, "data/tokens_consumed": 16087252992, "data/tokens_consumed_B": 16.087252992, "train/loss_slope": -7.28176924118647e-06} {"step": 7680, "timestamp": 1778334017.4860206, "train/loss": 2.363531303405762, "train/z_loss": 0.0013606209657154977, "train/perplexity": 10.628417428298395, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025700.427501637, "perf/iters_per_sec": 0.965929235220736, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352725267410279, "data/tokens_consumed": 16108224512, "data/tokens_consumed_B": 16.108224512, "train/loss_slope": -5.347530039945484e-06} {"step": 7690, "timestamp": 1778334027.8412874, "train/loss": 2.378950905799866, "train/z_loss": 0.0013521110638976097, "train/perplexity": 10.793573446855664, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026557.068060955, "perf/iters_per_sec": 0.9663377132706428, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034834909439087, "data/tokens_consumed": 16129196032, "data/tokens_consumed_B": 16.129196032, "train/loss_slope": -3.2058035257947188e-06} {"step": 7700, "timestamp": 1778334038.1794686, "grad/layer_0/attn": 0.0029757844749838114, "grad/layer_0/mlp": 0.003447633935138583, "grad/layer_0/attn_mlp_ratio": 0.863138153485654, "grad/layer_4/attn": 0.002233322709798813, "grad/layer_4/mlp": 0.0027457422111183405, "grad/layer_4/attn_mlp_ratio": 0.8133766598400054, "grad/layer_8/attn": 0.004933384247124195, "grad/layer_8/mlp": 0.003702949732542038, "grad/layer_8/attn_mlp_ratio": 1.3322849269436297, "grad/layer_12/attn": 0.005520028527826071, "grad/layer_12/mlp": 0.007660908158868551, "grad/layer_12/attn_mlp_ratio": 0.7205449199102315, "grad/layer_16/attn": 0.003419368527829647, "grad/layer_16/mlp": 0.004549025092273951, "grad/layer_16/attn_mlp_ratio": 0.751670607064804, "grad/layer_20/attn": 0.005142313428223133, "grad/layer_20/mlp": 0.006583758629858494, "grad/layer_20/attn_mlp_ratio": 0.7810604305564609, "grad/layer_24/attn": 0.008259104564785957, "grad/layer_24/mlp": 0.010083704255521297, "grad/layer_24/attn_mlp_ratio": 0.8190546126299025, "grad/layer_27/attn": 0.006288252770900726, "grad/layer_27/mlp": 0.01116402167826891, "grad/layer_27/attn_mlp_ratio": 0.563260525265276} {"step": 7700, "timestamp": 1778334038.1954715, "train/loss": 2.3193125486373902, "train/z_loss": 0.0013573448872193695, "train/perplexity": 10.168681428813048, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026413.7385823068, "perf/iters_per_sec": 0.9662693684493574, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349081039428711, "data/tokens_consumed": 16150167552, "data/tokens_consumed_B": 16.150167552, "train/loss_slope": -5.516181190987391e-06} {"step": 7710, "timestamp": 1778334048.5483773, "train/loss": 2.3541207790374754, "train/z_loss": 0.0013450428377836943, "train/perplexity": 10.528867589843266, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026646.7643445088, "perf/iters_per_sec": 0.9663804837915939, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347891092300414, "data/tokens_consumed": 16171139072, "data/tokens_consumed_B": 16.171139072, "train/loss_slope": -6.674145021275455e-06} {"step": 7720, "timestamp": 1778334058.9235082, "train/loss": 2.3471231698989867, "train/z_loss": 0.0013552968157455325, "train/perplexity": 10.455447870619919, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022593.5805149865, "perf/iters_per_sec": 0.9644477751326497, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368627786636353, "data/tokens_consumed": 16192110592, "data/tokens_consumed_B": 16.192110592, "train/loss_slope": -6.9268901463281255e-06} {"step": 7725, "timestamp": 1778334064.6939683, "eos/sharpness": 30.679464340209954, "eos/L0_probe": 2.3323304653167725, "eos/L_plus": 2.482414722442627, "eos/L_minus": 2.4890408515930176, "eos/grad_norm": 0.12396300584077835, "eos/embed_grad_frac": 0.18800294399261475, "eos/time_s": 0.6058647632598877} {"step": 7725, "timestamp": 1778334066.078125, "geo/rankme_last": 427.6092834472656, "geo/layer_0/stable_rank_q_proj": 20.723270416259766, "geo/layer_0/stable_rank_k_proj": 17.185131072998047, "geo/layer_0/stable_rank_o_proj": 44.721534729003906, "geo/layer_0/stable_rank_gate_proj": 128.09335327148438, "geo/layer_0/stable_rank_down_proj": 56.87689971923828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06494063138961792, "geo/layer_0/attn_entropy_mean": 6.241109371185303, "geo/layer_0/attn_entropy_std": 0.4383092224597931, "geo/layer_7/stable_rank_q_proj": 42.00350570678711, "geo/layer_7/stable_rank_k_proj": 39.00365447998047, "geo/layer_7/stable_rank_o_proj": 89.44918060302734, "geo/layer_7/stable_rank_gate_proj": 78.89336395263672, "geo/layer_7/stable_rank_down_proj": 144.59683227539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39381685853004456, "geo/layer_7/attn_entropy_mean": 4.72177267074585, "geo/layer_7/attn_entropy_std": 0.7803035378456116, "geo/layer_14/stable_rank_q_proj": 51.647987365722656, "geo/layer_14/stable_rank_k_proj": 42.39208221435547, "geo/layer_14/stable_rank_o_proj": 42.4703254699707, "geo/layer_14/stable_rank_gate_proj": 71.8935546875, "geo/layer_14/stable_rank_down_proj": 127.16211700439453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3798561096191406, "geo/layer_14/attn_entropy_mean": 5.526081085205078, "geo/layer_14/attn_entropy_std": 0.44303613901138306, "geo/layer_21/stable_rank_q_proj": 39.33552169799805, "geo/layer_21/stable_rank_k_proj": 28.688419342041016, "geo/layer_21/stable_rank_o_proj": 65.58876037597656, "geo/layer_21/stable_rank_gate_proj": 61.68104934692383, "geo/layer_21/stable_rank_down_proj": 49.6721305847168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13447779417037964, "geo/layer_21/attn_entropy_mean": 5.868893146514893, "geo/layer_21/attn_entropy_std": 0.3240988552570343, "geo/layer_27/stable_rank_q_proj": 44.38683319091797, "geo/layer_27/stable_rank_k_proj": 30.334680557250977, "geo/layer_27/stable_rank_o_proj": 108.11412811279297, "geo/layer_27/stable_rank_gate_proj": 71.77318572998047, "geo/layer_27/stable_rank_down_proj": 128.4595947265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09849608689546585, "geo/layer_27/attn_entropy_mean": 4.304799556732178, "geo/layer_27/attn_entropy_std": 0.6787376999855042, "attnres/final_alpha/block_0": 0.25615620613098145, "attnres/block_norm/0": 1.7779817581176758, "attnres/final_alpha/block_1": 0.0040590399876236916, "attnres/block_norm/1": 50212.08984375, "attnres/final_alpha/block_2": 0.008673334494233131, "attnres/block_norm/2": 29793.31640625, "attnres/final_alpha/block_3": 0.01054251380264759, "attnres/block_norm/3": 70653.2421875, "attnres/final_alpha/block_4": 0.01221908163279295, "attnres/block_norm/4": 17045.685546875, "attnres/final_alpha/block_5": 0.6058590412139893, "attnres/block_norm/5": 7122.1689453125, "attnres/final_alpha/block_6": 0.10249084234237671, "attnres/block_norm/6": 47241.41015625, "geo/tier1_time_s": 1.3643908500671387, "geo/step": 7725.0, "geo/rankme_slope": 0.000490282206632653} {"step": 7730, "timestamp": 1778334071.256571, "train/loss": 2.3622879505157472, "train/z_loss": 0.00134864398278296, "train/perplexity": 10.61521076674442, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701087.4452406385, "perf/iters_per_sec": 0.81114170324356, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2328302145004273, "data/tokens_consumed": 16213082112, "data/tokens_consumed_B": 16.213082112, "train/loss_slope": -5.387319518466085e-06} {"step": 7740, "timestamp": 1778334081.6094859, "train/loss": 2.2935775995254515, "train/z_loss": 0.0013583738007582724, "train/perplexity": 9.910329524707542, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026975.8276489621, "perf/iters_per_sec": 0.9665373934025584, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346211194992065, "data/tokens_consumed": 16234053632, "data/tokens_consumed_B": 16.234053632, "train/loss_slope": -1.0262175533387355e-05} {"step": 7750, "timestamp": 1778334091.9717836, "grad/layer_0/attn": 0.0026900027878582478, "grad/layer_0/mlp": 0.0030359665397554636, "grad/layer_0/attn_mlp_ratio": 0.8860449099252679, "grad/layer_4/attn": 0.002120893681421876, "grad/layer_4/mlp": 0.002633943222463131, "grad/layer_4/attn_mlp_ratio": 0.8052161424029896, "grad/layer_8/attn": 0.0032176340464502573, "grad/layer_8/mlp": 0.0034709107130765915, "grad/layer_8/attn_mlp_ratio": 0.9270287309970285, "grad/layer_12/attn": 0.005827965214848518, "grad/layer_12/mlp": 0.00738615682348609, "grad/layer_12/attn_mlp_ratio": 0.7890389109276951, "grad/layer_16/attn": 0.007658393122255802, "grad/layer_16/mlp": 0.004051135387271643, "grad/layer_16/attn_mlp_ratio": 1.8904312497861124, "grad/layer_20/attn": 0.004629320465028286, "grad/layer_20/mlp": 0.005553583614528179, "grad/layer_20/attn_mlp_ratio": 0.8335735451178994, "grad/layer_24/attn": 0.0075805108062922955, "grad/layer_24/mlp": 0.008949331939220428, "grad/layer_24/attn_mlp_ratio": 0.8470476649062436, "grad/layer_27/attn": 0.008642095141112804, "grad/layer_27/mlp": 0.009360384196043015, "grad/layer_27/attn_mlp_ratio": 0.9232628562874435} {"step": 7750, "timestamp": 1778334091.9876032, "train/loss": 2.330893039703369, "train/z_loss": 0.001352397119626403, "train/perplexity": 10.287124242553167, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022023.971766628, "perf/iters_per_sec": 0.9641761645157948, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371548652648925, "data/tokens_consumed": 16255025152, "data/tokens_consumed_B": 16.255025152, "train/loss_slope": -1.121836006194305e-05} {"step": 7760, "timestamp": 1778334102.3412044, "train/loss": 2.4120413780212404, "train/z_loss": 0.0013448533485643567, "train/perplexity": 11.156712983286429, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026603.526102624, "perf/iters_per_sec": 0.9663598661912078, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348111867904664, "data/tokens_consumed": 16275996672, "data/tokens_consumed_B": 16.275996672, "train/loss_slope": -7.317708370530318e-06} {"step": 7770, "timestamp": 1778334112.6964219, "train/loss": 2.340514874458313, "train/z_loss": 0.0013622736791148783, "train/perplexity": 10.386582972530517, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026452.0200416658, "perf/iters_per_sec": 0.96628762247165, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348885536193848, "data/tokens_consumed": 16296968192, "data/tokens_consumed_B": 16.296968192, "train/loss_slope": -6.32289144823292e-06} {"step": 7780, "timestamp": 1778334123.0531797, "train/loss": 2.327696514129639, "train/z_loss": 0.0013492402504198253, "train/perplexity": 10.254293686642091, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026267.395361477, "perf/iters_per_sec": 0.9661995865638147, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349828481674195, "data/tokens_consumed": 16317939712, "data/tokens_consumed_B": 16.317939712, "train/loss_slope": -6.143771124930434e-06} {"step": 7790, "timestamp": 1778334133.4025214, "train/loss": 2.3498870372772216, "train/z_loss": 0.0013595383381471038, "train/perplexity": 10.484385313120026, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027338.4066828259, "perf/iters_per_sec": 0.96671028455869, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034436082839966, "data/tokens_consumed": 16338911232, "data/tokens_consumed_B": 16.338911232, "train/loss_slope": -5.60047848962137e-06} {"step": 7800, "timestamp": 1778334143.7400615, "grad/layer_0/attn": 0.00343268527649343, "grad/layer_0/mlp": 0.003550600726157427, "grad/layer_0/attn_mlp_ratio": 0.9667899728983028, "grad/layer_4/attn": 0.0033076005056500435, "grad/layer_4/mlp": 0.0026816301979124546, "grad/layer_4/attn_mlp_ratio": 1.2334289734960417, "grad/layer_8/attn": 0.0038998674135655165, "grad/layer_8/mlp": 0.0035751701798290014, "grad/layer_8/attn_mlp_ratio": 1.090820047248783, "grad/layer_12/attn": 0.006429442670196295, "grad/layer_12/mlp": 0.00688123656436801, "grad/layer_12/attn_mlp_ratio": 0.9343440697932733, "grad/layer_16/attn": 0.004034093115478754, "grad/layer_16/mlp": 0.005251740571111441, "grad/layer_16/attn_mlp_ratio": 0.7681440055997677, "grad/layer_20/attn": 0.003252104390412569, "grad/layer_20/mlp": 0.006853823084384203, "grad/layer_20/attn_mlp_ratio": 0.47449493558896966, "grad/layer_24/attn": 0.015323450788855553, "grad/layer_24/mlp": 0.013029691763222218, "grad/layer_24/attn_mlp_ratio": 1.1760409186734284, "grad/layer_27/attn": 0.007513575721532106, "grad/layer_27/mlp": 0.012495053932070732, "grad/layer_27/attn_mlp_ratio": 0.6013239880553702} {"step": 7800, "timestamp": 1778334144.3434641, "eos/sharpness": 61.607050895690904, "eos/L0_probe": 2.3313753604888916, "eos/L_plus": 2.6092703342437744, "eos/L_minus": 2.669550895690918, "eos/grad_norm": 0.2286168932914734, "eos/embed_grad_frac": 0.05016891285777092, "eos/time_s": 0.6005451679229736} {"step": 7800, "timestamp": 1778334144.364458, "train/loss": 2.343933415412903, "train/z_loss": 0.0013528716284781695, "train/perplexity": 10.422150692014942, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914304.0728886854, "perf/iters_per_sec": 0.9128113140529086, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0955166578292848, "data/tokens_consumed": 16359882752, "data/tokens_consumed_B": 16.359882752, "train/loss_slope": -6.207965352389985e-06} {"step": 7800, "timestamp": 1778334145.726779, "geo/rankme_last": 428.6179504394531, "geo/layer_0/stable_rank_q_proj": 20.70173454284668, "geo/layer_0/stable_rank_k_proj": 17.156272888183594, "geo/layer_0/stable_rank_o_proj": 44.764923095703125, "geo/layer_0/stable_rank_gate_proj": 128.03256225585938, "geo/layer_0/stable_rank_down_proj": 56.87255096435547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06815975904464722, "geo/layer_0/attn_entropy_mean": 6.241145610809326, "geo/layer_0/attn_entropy_std": 0.4350070059299469, "geo/layer_7/stable_rank_q_proj": 42.11979675292969, "geo/layer_7/stable_rank_k_proj": 38.87593078613281, "geo/layer_7/stable_rank_o_proj": 89.36882019042969, "geo/layer_7/stable_rank_gate_proj": 78.83891296386719, "geo/layer_7/stable_rank_down_proj": 144.49566650390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3923218250274658, "geo/layer_7/attn_entropy_mean": 4.718314170837402, "geo/layer_7/attn_entropy_std": 0.7775508761405945, "geo/layer_14/stable_rank_q_proj": 51.5587272644043, "geo/layer_14/stable_rank_k_proj": 42.3217658996582, "geo/layer_14/stable_rank_o_proj": 42.42853546142578, "geo/layer_14/stable_rank_gate_proj": 71.87638092041016, "geo/layer_14/stable_rank_down_proj": 127.49188232421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3686566948890686, "geo/layer_14/attn_entropy_mean": 5.5358662605285645, "geo/layer_14/attn_entropy_std": 0.453682541847229, "geo/layer_21/stable_rank_q_proj": 39.29228973388672, "geo/layer_21/stable_rank_k_proj": 28.75972557067871, "geo/layer_21/stable_rank_o_proj": 65.6216049194336, "geo/layer_21/stable_rank_gate_proj": 61.67290496826172, "geo/layer_21/stable_rank_down_proj": 49.706871032714844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13074873387813568, "geo/layer_21/attn_entropy_mean": 5.859711647033691, "geo/layer_21/attn_entropy_std": 0.32693514227867126, "geo/layer_27/stable_rank_q_proj": 44.3368034362793, "geo/layer_27/stable_rank_k_proj": 30.339847564697266, "geo/layer_27/stable_rank_o_proj": 108.15693664550781, "geo/layer_27/stable_rank_gate_proj": 71.7545166015625, "geo/layer_27/stable_rank_down_proj": 128.2793426513672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09578487277030945, "geo/layer_27/attn_entropy_mean": 4.325716018676758, "geo/layer_27/attn_entropy_std": 0.6887741684913635, "attnres/final_alpha/block_0": 0.25662457942962646, "attnres/block_norm/0": 1.7780731916427612, "attnres/final_alpha/block_1": 0.004053303971886635, "attnres/block_norm/1": 50397.48828125, "attnres/final_alpha/block_2": 0.00882573053240776, "attnres/block_norm/2": 29869.130859375, "attnres/final_alpha/block_3": 0.010683028027415276, "attnres/block_norm/3": 70425.734375, "attnres/final_alpha/block_4": 0.012261339463293552, "attnres/block_norm/4": 17049.921875, "attnres/final_alpha/block_5": 0.6049908399581909, "attnres/block_norm/5": 7127.03759765625, "attnres/final_alpha/block_6": 0.10256115347146988, "attnres/block_norm/6": 46786.76171875, "geo/tier1_time_s": 1.3582448959350586, "geo/step": 7800.0, "geo/rankme_slope": 0.00048436370642006803} {"step": 7810, "timestamp": 1778334156.0724156, "train/loss": 2.3394222497940063, "train/z_loss": 0.0013626037631183862, "train/perplexity": 10.375240533439623, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791821.4771097736, "perf/iters_per_sec": 0.8544070611523502, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1704023122787475, "data/tokens_consumed": 16380854272, "data/tokens_consumed_B": 16.380854272, "train/loss_slope": -6.5939931872367794e-06} {"step": 7820, "timestamp": 1778334166.4186378, "train/loss": 2.359215784072876, "train/z_loss": 0.00135657339124009, "train/perplexity": 10.582649115459319, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027898.157017876, "perf/iters_per_sec": 0.9669771943177585, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341505527496337, "data/tokens_consumed": 16401825792, "data/tokens_consumed_B": 16.401825792, "train/loss_slope": -6.978657994583608e-06} {"step": 7830, "timestamp": 1778334176.7660499, "train/loss": 2.329245090484619, "train/z_loss": 0.0013470383128151298, "train/perplexity": 10.270185545084518, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028021.7774883388, "perf/iters_per_sec": 0.9670361411515898, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340875148773194, "data/tokens_consumed": 16422797312, "data/tokens_consumed_B": 16.422797312, "train/loss_slope": -7.68948102047446e-06} {"step": 7840, "timestamp": 1778334187.121162, "train/loss": 2.3645534038543703, "train/z_loss": 0.0013487640535458922, "train/perplexity": 10.63928629210893, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026476.763705678, "perf/iters_per_sec": 0.9662994211700812, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348759174346924, "data/tokens_consumed": 16443768832, "data/tokens_consumed_B": 16.443768832, "train/loss_slope": -7.504650239575316e-06} {"step": 7850, "timestamp": 1778334197.4681933, "grad/layer_0/attn": 0.0027789887972176075, "grad/layer_0/mlp": 0.003102118382230401, "grad/layer_0/attn_mlp_ratio": 0.8958358016098515, "grad/layer_4/attn": 0.0017969118198379874, "grad/layer_4/mlp": 0.0026373076252639294, "grad/layer_4/attn_mlp_ratio": 0.6813432511589678, "grad/layer_8/attn": 0.0032713438849896193, "grad/layer_8/mlp": 0.003417140571400523, "grad/layer_8/attn_mlp_ratio": 0.9573336890602315, "grad/layer_12/attn": 0.008168249391019344, "grad/layer_12/mlp": 0.007223047781735659, "grad/layer_12/attn_mlp_ratio": 1.1308590950468072, "grad/layer_16/attn": 0.0040501500479876995, "grad/layer_16/mlp": 0.004859239794313908, "grad/layer_16/attn_mlp_ratio": 0.8334945662442035, "grad/layer_20/attn": 0.002686401130631566, "grad/layer_20/mlp": 0.006524622905999422, "grad/layer_20/attn_mlp_ratio": 0.41173277416356585, "grad/layer_24/attn": 0.01327730342745781, "grad/layer_24/mlp": 0.012433790601789951, "grad/layer_24/attn_mlp_ratio": 1.0678403510159156, "grad/layer_27/attn": 0.0058327894657850266, "grad/layer_27/mlp": 0.01391738560050726, "grad/layer_27/attn_mlp_ratio": 0.4191009426125507} {"step": 7850, "timestamp": 1778334197.484409, "train/loss": 2.361265182495117, "train/z_loss": 0.0013467698590829968, "train/perplexity": 10.604359418791706, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024636.6446813156, "perf/iters_per_sec": 0.9654219840437487, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035816478729248, "data/tokens_consumed": 16464740352, "data/tokens_consumed_B": 16.464740352, "train/loss_slope": -8.150556643303162e-06} {"step": 7860, "timestamp": 1778334207.8355262, "train/loss": 2.3432067155838014, "train/z_loss": 0.0013516479753889143, "train/perplexity": 10.414579668152241, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027370.3213428645, "perf/iters_per_sec": 0.9667255026544879, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034419798851013, "data/tokens_consumed": 16485711872, "data/tokens_consumed_B": 16.485711872, "train/loss_slope": -7.485767702708651e-06} {"step": 7870, "timestamp": 1778334218.1841435, "train/loss": 2.32133526802063, "train/z_loss": 0.0013530951924622058, "train/perplexity": 10.189270633913342, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027576.8330101138, "perf/iters_per_sec": 0.9668239750910348, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343144416809082, "data/tokens_consumed": 16506683392, "data/tokens_consumed_B": 16.506683392, "train/loss_slope": -9.490560920181938e-06} {"step": 7875, "timestamp": 1778334223.955373, "eos/sharpness": 35.72344779968261, "eos/L0_probe": 2.3273558616638184, "eos/L_plus": 2.51440167427063, "eos/L_minus": 2.497544527053833, "eos/grad_norm": 0.1338728815317154, "eos/embed_grad_frac": 0.15338869392871857, "eos/time_s": 0.6034715175628662} {"step": 7875, "timestamp": 1778334225.3358836, "geo/rankme_last": 428.47979736328125, "geo/layer_0/stable_rank_q_proj": 20.71592903137207, "geo/layer_0/stable_rank_k_proj": 17.17580223083496, "geo/layer_0/stable_rank_o_proj": 44.68292236328125, "geo/layer_0/stable_rank_gate_proj": 127.93656921386719, "geo/layer_0/stable_rank_down_proj": 56.88298797607422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06540928035974503, "geo/layer_0/attn_entropy_mean": 6.241675853729248, "geo/layer_0/attn_entropy_std": 0.43863821029663086, "geo/layer_7/stable_rank_q_proj": 42.15744400024414, "geo/layer_7/stable_rank_k_proj": 38.86376190185547, "geo/layer_7/stable_rank_o_proj": 89.28470611572266, "geo/layer_7/stable_rank_gate_proj": 79.01297760009766, "geo/layer_7/stable_rank_down_proj": 144.44920349121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4103047847747803, "geo/layer_7/attn_entropy_mean": 4.737874984741211, "geo/layer_7/attn_entropy_std": 0.7762594819068909, "geo/layer_14/stable_rank_q_proj": 51.48223114013672, "geo/layer_14/stable_rank_k_proj": 42.343318939208984, "geo/layer_14/stable_rank_o_proj": 42.40980911254883, "geo/layer_14/stable_rank_gate_proj": 71.87657928466797, "geo/layer_14/stable_rank_down_proj": 127.2708740234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3677288591861725, "geo/layer_14/attn_entropy_mean": 5.513367176055908, "geo/layer_14/attn_entropy_std": 0.47100961208343506, "geo/layer_21/stable_rank_q_proj": 39.320457458496094, "geo/layer_21/stable_rank_k_proj": 28.643672943115234, "geo/layer_21/stable_rank_o_proj": 65.54878997802734, "geo/layer_21/stable_rank_gate_proj": 61.68997573852539, "geo/layer_21/stable_rank_down_proj": 49.78187942504883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13727284967899323, "geo/layer_21/attn_entropy_mean": 5.849212646484375, "geo/layer_21/attn_entropy_std": 0.31749245524406433, "geo/layer_27/stable_rank_q_proj": 44.28116226196289, "geo/layer_27/stable_rank_k_proj": 30.295656204223633, "geo/layer_27/stable_rank_o_proj": 107.95460510253906, "geo/layer_27/stable_rank_gate_proj": 71.75364685058594, "geo/layer_27/stable_rank_down_proj": 128.6326446533203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10414958000183105, "geo/layer_27/attn_entropy_mean": 4.317905426025391, "geo/layer_27/attn_entropy_std": 0.6886865496635437, "attnres/final_alpha/block_0": 0.2533251643180847, "attnres/block_norm/0": 1.7782145738601685, "attnres/final_alpha/block_1": 0.00386800616979599, "attnres/block_norm/1": 50419.1640625, "attnres/final_alpha/block_2": 0.00865880399942398, "attnres/block_norm/2": 29828.146484375, "attnres/final_alpha/block_3": 0.010560600087046623, "attnres/block_norm/3": 70887.109375, "attnres/final_alpha/block_4": 0.011988621205091476, "attnres/block_norm/4": 17000.45703125, "attnres/final_alpha/block_5": 0.6109501123428345, "attnres/block_norm/5": 7071.30322265625, "attnres/final_alpha/block_6": 0.10064871609210968, "attnres/block_norm/6": 47411.0234375, "geo/tier1_time_s": 1.358898639678955, "geo/step": 7875.0, "geo/rankme_slope": 0.000467914431397559} {"step": 7880, "timestamp": 1778334230.5130188, "train/loss": 2.3801167249679565, "train/z_loss": 0.0013637986383400857, "train/perplexity": 10.806164139481565, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701837.7083910257, "perf/iters_per_sec": 0.8114994565920952, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2322867155075072, "data/tokens_consumed": 16527654912, "data/tokens_consumed_B": 16.527654912, "train/loss_slope": -8.706524033750983e-06} {"step": 7890, "timestamp": 1778334240.8640323, "train/loss": 2.3257635116577147, "train/z_loss": 0.0013548758463002742, "train/perplexity": 10.234491256836835, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027392.470614126, "perf/iters_per_sec": 0.9667360642500524, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344084978103638, "data/tokens_consumed": 16548626432, "data/tokens_consumed_B": 16.548626432, "train/loss_slope": -9.686044712450022e-06} {"step": 7900, "timestamp": 1778334251.2002842, "grad/layer_0/attn": 0.0029765968210995197, "grad/layer_0/mlp": 0.0031940662302076817, "grad/layer_0/attn_mlp_ratio": 0.9319145294349532, "grad/layer_4/attn": 0.0019038669997826219, "grad/layer_4/mlp": 0.002595399273559451, "grad/layer_4/attn_mlp_ratio": 0.7335545423868891, "grad/layer_8/attn": 0.0037928440142422915, "grad/layer_8/mlp": 0.0035694269463419914, "grad/layer_8/attn_mlp_ratio": 1.0625918291646446, "grad/layer_12/attn": 0.006612875498831272, "grad/layer_12/mlp": 0.0064033400267362595, "grad/layer_12/attn_mlp_ratio": 1.0327228240180664, "grad/layer_16/attn": 0.0032371573615819216, "grad/layer_16/mlp": 0.004267066717147827, "grad/layer_16/attn_mlp_ratio": 0.7586376075886451, "grad/layer_20/attn": 0.00381747679784894, "grad/layer_20/mlp": 0.005722695961594582, "grad/layer_20/attn_mlp_ratio": 0.6670766290504745, "grad/layer_24/attn": 0.00932325329631567, "grad/layer_24/mlp": 0.010803072713315487, "grad/layer_24/attn_mlp_ratio": 0.8630186482520192, "grad/layer_27/attn": 0.004360701888799667, "grad/layer_27/mlp": 0.012122622691094875, "grad/layer_27/attn_mlp_ratio": 0.35971604197756474} {"step": 7900, "timestamp": 1778334251.2162387, "train/loss": 2.3371713161468506, "train/z_loss": 0.0013552287477068604, "train/perplexity": 10.351912819842841, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026751.272163066, "perf/iters_per_sec": 0.9664303170028048, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347357511520385, "data/tokens_consumed": 16569597952, "data/tokens_consumed_B": 16.569597952, "train/loss_slope": -9.15259302037988e-06} {"step": 7910, "timestamp": 1778334261.575909, "train/loss": 2.3295671939849854, "train/z_loss": 0.0013537701568566262, "train/perplexity": 10.273494140624496, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025826.9061625067, "perf/iters_per_sec": 0.9659895449459585, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352078914642333, "data/tokens_consumed": 16590569472, "data/tokens_consumed_B": 16.590569472, "train/loss_slope": -8.92592778336058e-06} {"step": 7920, "timestamp": 1778334271.9205317, "train/loss": 2.342387008666992, "train/z_loss": 0.001355899719055742, "train/perplexity": 10.406046263085987, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028626.8187182802, "perf/iters_per_sec": 0.9673246472922707, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337790966033935, "data/tokens_consumed": 16611540992, "data/tokens_consumed_B": 16.611540992, "train/loss_slope": -7.423119054268389e-06} {"step": 7930, "timestamp": 1778334282.2626066, "train/loss": 2.350063991546631, "train/z_loss": 0.001360883447341621, "train/perplexity": 10.4862407340208, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028691.665977374, "perf/iters_per_sec": 0.9673555688750143, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337460517883301, "data/tokens_consumed": 16632512512, "data/tokens_consumed_B": 16.632512512, "train/loss_slope": -7.702695976461031e-06} {"step": 7940, "timestamp": 1778334292.6144006, "train/loss": 2.335556411743164, "train/z_loss": 0.0013476261752657591, "train/perplexity": 10.335208961440932, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027104.3809691756, "perf/iters_per_sec": 0.9665986924024466, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345555067062377, "data/tokens_consumed": 16653484032, "data/tokens_consumed_B": 16.653484032, "train/loss_slope": -8.049991929849396e-06} {"step": 7950, "timestamp": 1778334302.949871, "grad/layer_0/attn": 0.003208582056686282, "grad/layer_0/mlp": 0.0034827417694032192, "grad/layer_0/attn_mlp_ratio": 0.9212804672302873, "grad/layer_4/attn": 0.002525696996599436, "grad/layer_4/mlp": 0.0026325879152864218, "grad/layer_4/attn_mlp_ratio": 0.9593969819560405, "grad/layer_8/attn": 0.005636426620185375, "grad/layer_8/mlp": 0.003614221466705203, "grad/layer_8/attn_mlp_ratio": 1.559513304914412, "grad/layer_12/attn": 0.006240693852305412, "grad/layer_12/mlp": 0.006631424184888601, "grad/layer_12/attn_mlp_ratio": 0.9410789574309748, "grad/layer_16/attn": 0.005708461627364159, "grad/layer_16/mlp": 0.004544856492429972, "grad/layer_16/attn_mlp_ratio": 1.2560267879238076, "grad/layer_20/attn": 0.0028482412453740835, "grad/layer_20/mlp": 0.006061153952032328, "grad/layer_20/attn_mlp_ratio": 0.4699173162277665, "grad/layer_24/attn": 0.013191211968660355, "grad/layer_24/mlp": 0.011242535896599293, "grad/layer_24/attn_mlp_ratio": 1.173330641116071, "grad/layer_27/attn": 0.004296105355024338, "grad/layer_27/mlp": 0.012201603502035141, "grad/layer_27/attn_mlp_ratio": 0.35209350304641746} {"step": 7950, "timestamp": 1778334303.5480926, "eos/sharpness": 51.69534683227538, "eos/L0_probe": 2.32910418510437, "eos/L_plus": 2.5847015380859375, "eos/L_minus": 2.5904603004455566, "eos/grad_norm": 0.1881004273891449, "eos/embed_grad_frac": 0.0716482624411583, "eos/time_s": 0.5953986644744873} {"step": 7950, "timestamp": 1778334303.5671272, "train/loss": 2.3505052089691163, "train/z_loss": 0.0013431798899546266, "train/perplexity": 10.490868466972152, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915800.7134012699, "perf/iters_per_sec": 0.9135249678617811, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0946608304977417, "data/tokens_consumed": 16674455552, "data/tokens_consumed_B": 16.674455552, "train/loss_slope": -6.313409728042486e-06} {"step": 7950, "timestamp": 1778334304.931313, "geo/rankme_last": 428.5312805175781, "geo/layer_0/stable_rank_q_proj": 20.73472785949707, "geo/layer_0/stable_rank_k_proj": 17.203269958496094, "geo/layer_0/stable_rank_o_proj": 44.65739822387695, "geo/layer_0/stable_rank_gate_proj": 127.93048858642578, "geo/layer_0/stable_rank_down_proj": 56.8538932800293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06529488414525986, "geo/layer_0/attn_entropy_mean": 6.245203018188477, "geo/layer_0/attn_entropy_std": 0.4343406558036804, "geo/layer_7/stable_rank_q_proj": 42.132564544677734, "geo/layer_7/stable_rank_k_proj": 38.77833557128906, "geo/layer_7/stable_rank_o_proj": 89.15536499023438, "geo/layer_7/stable_rank_gate_proj": 79.15216064453125, "geo/layer_7/stable_rank_down_proj": 144.4916534423828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4035850465297699, "geo/layer_7/attn_entropy_mean": 4.742366790771484, "geo/layer_7/attn_entropy_std": 0.765580952167511, "geo/layer_14/stable_rank_q_proj": 51.48371887207031, "geo/layer_14/stable_rank_k_proj": 42.30797576904297, "geo/layer_14/stable_rank_o_proj": 42.44784927368164, "geo/layer_14/stable_rank_gate_proj": 71.84549713134766, "geo/layer_14/stable_rank_down_proj": 127.33424377441406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37443533539772034, "geo/layer_14/attn_entropy_mean": 5.524881362915039, "geo/layer_14/attn_entropy_std": 0.46754589676856995, "geo/layer_21/stable_rank_q_proj": 39.32011413574219, "geo/layer_21/stable_rank_k_proj": 28.668378829956055, "geo/layer_21/stable_rank_o_proj": 65.48572540283203, "geo/layer_21/stable_rank_gate_proj": 61.702064514160156, "geo/layer_21/stable_rank_down_proj": 49.83917999267578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13299734890460968, "geo/layer_21/attn_entropy_mean": 5.857609748840332, "geo/layer_21/attn_entropy_std": 0.3302696943283081, "geo/layer_27/stable_rank_q_proj": 44.364845275878906, "geo/layer_27/stable_rank_k_proj": 30.269922256469727, "geo/layer_27/stable_rank_o_proj": 107.96271514892578, "geo/layer_27/stable_rank_gate_proj": 71.72785949707031, "geo/layer_27/stable_rank_down_proj": 128.75318908691406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09932978451251984, "geo/layer_27/attn_entropy_mean": 4.313637733459473, "geo/layer_27/attn_entropy_std": 0.6741504073143005, "attnres/final_alpha/block_0": 0.254950612783432, "attnres/block_norm/0": 1.7781016826629639, "attnres/final_alpha/block_1": 0.003989247605204582, "attnres/block_norm/1": 50237.27734375, "attnres/final_alpha/block_2": 0.008655034005641937, "attnres/block_norm/2": 29833.322265625, "attnres/final_alpha/block_3": 0.010666990652680397, "attnres/block_norm/3": 70298.3515625, "attnres/final_alpha/block_4": 0.012282513082027435, "attnres/block_norm/4": 17026.34375, "attnres/final_alpha/block_5": 0.6084984540939331, "attnres/block_norm/5": 7075.423828125, "attnres/final_alpha/block_6": 0.10095717757940292, "attnres/block_norm/6": 47090.890625, "geo/tier1_time_s": 1.3601696491241455, "geo/step": 7950.0, "geo/rankme_slope": 0.00045384423300570226} {"step": 7960, "timestamp": 1778334315.2754478, "train/loss": 2.3493518114089964, "train/z_loss": 0.0013526547932997345, "train/perplexity": 10.47877530033391, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791725.3762599796, "perf/iters_per_sec": 0.8543612366962335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.170465087890625, "data/tokens_consumed": 16695427072, "data/tokens_consumed_B": 16.695427072, "train/loss_slope": -4.559143033787328e-06} {"step": 7970, "timestamp": 1778334325.639929, "train/loss": 2.3432576417922975, "train/z_loss": 0.0013391584274359048, "train/perplexity": 10.415110056713045, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024681.2903629735, "perf/iters_per_sec": 0.9654432727637164, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357936382293702, "data/tokens_consumed": 16716398592, "data/tokens_consumed_B": 16.716398592, "train/loss_slope": -3.842304878108964e-06} {"step": 7980, "timestamp": 1778334335.9883118, "train/loss": 2.3911710739135743, "train/z_loss": 0.0013505724957212806, "train/perplexity": 10.926281937486502, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027886.8430377773, "perf/iters_per_sec": 0.9669717993916404, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034156322479248, "data/tokens_consumed": 16737370112, "data/tokens_consumed_B": 16.737370112, "train/loss_slope": 7.520389528271729e-07} {"step": 7990, "timestamp": 1778334346.3346457, "train/loss": 2.3529967308044433, "train/z_loss": 0.0013490490382537246, "train/perplexity": 10.517039283871695, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027989.608532517, "perf/iters_per_sec": 0.9670208017981133, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341039180755616, "data/tokens_consumed": 16758341632, "data/tokens_consumed_B": 16.758341632, "train/loss_slope": 3.7313676331088405e-06} {"step": 8000, "timestamp": 1778334356.6832583, "grad/layer_0/attn": 0.0034535371232777834, "grad/layer_0/mlp": 0.0035356308799237013, "grad/layer_0/attn_mlp_ratio": 0.9767809884255259, "grad/layer_4/attn": 0.0025062442291527987, "grad/layer_4/mlp": 0.0026652072556316853, "grad/layer_4/attn_mlp_ratio": 0.9403561879930312, "grad/layer_8/attn": 0.008748007006943226, "grad/layer_8/mlp": 0.003732087090611458, "grad/layer_8/attn_mlp_ratio": 2.3439985617029406, "grad/layer_12/attn": 0.00559860747307539, "grad/layer_12/mlp": 0.007860744372010231, "grad/layer_12/attn_mlp_ratio": 0.7122235677562556, "grad/layer_16/attn": 0.004726317245513201, "grad/layer_16/mlp": 0.005568350199609995, "grad/layer_16/attn_mlp_ratio": 0.84878231275145, "grad/layer_20/attn": 0.004139110911637545, "grad/layer_20/mlp": 0.008049429394304752, "grad/layer_20/attn_mlp_ratio": 0.5142117108505774, "grad/layer_24/attn": 0.020027173683047295, "grad/layer_24/mlp": 0.015484130010008812, "grad/layer_24/attn_mlp_ratio": 1.2933999870035902, "grad/layer_27/attn": 0.01311112567782402, "grad/layer_27/mlp": 0.016682405024766922, "grad/layer_27/attn_mlp_ratio": 0.7859253854445164} {"step": 8000, "timestamp": 1778334356.6990557, "train/loss": 2.308514213562012, "train/z_loss": 0.0013537625200115144, "train/perplexity": 10.059467325940577, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024773.8036007215, "perf/iters_per_sec": 0.9654873865131004, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357463121414185, "data/tokens_consumed": 16779313152, "data/tokens_consumed_B": 16.779313152, "train/loss_slope": 4.1876045247653647e-07} {"step": 8000, "timestamp": 1778334363.7436779, "geo/ww_alpha_mean": 8.119201215639448, "geo/ww_alpha_std": 6.208016660038356, "geo/ww_alpha_min": 1.3526119289665623, "geo/ww_alpha_max": 63.180125235196186, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.8988163422686357, "geo/ww_alpha_by_type/k_proj": 4.331106417779153, "geo/ww_alpha_by_type/v_proj": 12.296020551208063, "geo/ww_alpha_by_type/o_proj": 8.825715822542554, "geo/ww_alpha_by_type/gate_proj": 7.752663156708382, "geo/ww_alpha_by_type/up_proj": 11.674452274304086, "geo/ww_alpha_by_type/down_proj": 8.167899451568982, "geo/twonn_id/layer_0": 0.7178983092308044, "geo/twonn_id/layer_7": 3.6659841537475586, "geo/twonn_id/layer_14": 5.222841739654541, "geo/twonn_id/layer_21": 7.3619842529296875, "geo/twonn_id/layer_27": 6.680896282196045, "geo/tier2_time_s": 7.036991357803345} {"step": 8000, "timestamp": 1778334364.5006049, "eoc/jacobian_sigma/layer_0/attn": 1599.64453125, "eoc/jacobian_sigma/layer_0/mlp": 10888.025390625, "eoc/jacobian_sigma/layer_0": 10888.025390625, "eoc/jacobian_sigma/layer_7/attn": 1.1460472345352173, "eoc/jacobian_sigma/layer_7/mlp": 1.9582421779632568, "eoc/jacobian_sigma/layer_7": 1.9582421779632568, "eoc/jacobian_sigma/layer_14/attn": 2.2855751514434814, "eoc/jacobian_sigma/layer_14/mlp": 12.990507125854492, "eoc/jacobian_sigma/layer_14": 12.990507125854492, "eoc/jacobian_sigma/layer_21/attn": 1.1009714603424072, "eoc/jacobian_sigma/layer_21/mlp": 4.974799156188965, "eoc/jacobian_sigma/layer_21": 4.974799156188965, "eoc/jacobian_sigma/layer_27/attn": 3.7344677448272705, "eoc/jacobian_sigma/layer_27/mlp": 36.31087112426758, "eoc/jacobian_sigma/layer_27": 36.31087112426758, "eoc/layer0_sigma": 10888.025390625, "eoc/sigma_max": 36.31087112426758, "eoc/sigma_min": 1.9582421779632568, "eoc/sigma_mean": 14.058604896068573, "eoc/time_s": 0.750767707824707} {"step": 8010, "timestamp": 1778334374.8740582, "train/loss": 2.3287273168563845, "train/z_loss": 0.0013548558810725809, "train/perplexity": 10.264869290279234, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1154291.6245490718, "perf/iters_per_sec": 0.5504091379876479, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.81683030128479, "data/tokens_consumed": 16800284672, "data/tokens_consumed_B": 16.800284672, "train/loss_slope": -2.944251863655275e-06} {"step": 8020, "timestamp": 1778334385.2297444, "train/loss": 2.34705228805542, "train/z_loss": 0.0013474767445586621, "train/perplexity": 10.454706795464231, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026573.7366811382, "perf/iters_per_sec": 0.9663456614881221, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034826397895813, "data/tokens_consumed": 16821256192, "data/tokens_consumed_B": 16.821256192, "train/loss_slope": -1.7255624278400352e-06} {"step": 8025, "timestamp": 1778334391.0064468, "eos/sharpness": 7.12008476257324, "eos/L0_probe": 2.329749822616577, "eos/L_plus": 2.376286506652832, "eos/L_minus": 2.3544139862060547, "eos/grad_norm": 0.08744406700134277, "eos/embed_grad_frac": 0.2875410318374634, "eos/time_s": 0.6102373600006104} {"step": 8025, "timestamp": 1778334392.384451, "geo/rankme_last": 428.0624084472656, "geo/layer_0/stable_rank_q_proj": 20.721481323242188, "geo/layer_0/stable_rank_k_proj": 17.16172981262207, "geo/layer_0/stable_rank_o_proj": 44.643157958984375, "geo/layer_0/stable_rank_gate_proj": 127.76832580566406, "geo/layer_0/stable_rank_down_proj": 56.97068405151367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06508664786815643, "geo/layer_0/attn_entropy_mean": 6.246817111968994, "geo/layer_0/attn_entropy_std": 0.4370822310447693, "geo/layer_7/stable_rank_q_proj": 42.11671829223633, "geo/layer_7/stable_rank_k_proj": 38.74907302856445, "geo/layer_7/stable_rank_o_proj": 89.37348937988281, "geo/layer_7/stable_rank_gate_proj": 79.23506927490234, "geo/layer_7/stable_rank_down_proj": 144.54022216796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3902129530906677, "geo/layer_7/attn_entropy_mean": 4.727139949798584, "geo/layer_7/attn_entropy_std": 0.7724517583847046, "geo/layer_14/stable_rank_q_proj": 51.43336486816406, "geo/layer_14/stable_rank_k_proj": 42.347354888916016, "geo/layer_14/stable_rank_o_proj": 42.4295654296875, "geo/layer_14/stable_rank_gate_proj": 71.94146728515625, "geo/layer_14/stable_rank_down_proj": 127.3056640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3727954924106598, "geo/layer_14/attn_entropy_mean": 5.491267204284668, "geo/layer_14/attn_entropy_std": 0.44631195068359375, "geo/layer_21/stable_rank_q_proj": 39.321346282958984, "geo/layer_21/stable_rank_k_proj": 28.79684066772461, "geo/layer_21/stable_rank_o_proj": 65.60594177246094, "geo/layer_21/stable_rank_gate_proj": 61.606475830078125, "geo/layer_21/stable_rank_down_proj": 49.86307907104492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1289392113685608, "geo/layer_21/attn_entropy_mean": 5.853838920593262, "geo/layer_21/attn_entropy_std": 0.3203122913837433, "geo/layer_27/stable_rank_q_proj": 44.380699157714844, "geo/layer_27/stable_rank_k_proj": 30.169919967651367, "geo/layer_27/stable_rank_o_proj": 108.03981018066406, "geo/layer_27/stable_rank_gate_proj": 71.67410278320312, "geo/layer_27/stable_rank_down_proj": 128.77151489257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10724131017923355, "geo/layer_27/attn_entropy_mean": 4.312431335449219, "geo/layer_27/attn_entropy_std": 0.6641622185707092, "attnres/final_alpha/block_0": 0.2530341148376465, "attnres/block_norm/0": 1.7780711650848389, "attnres/final_alpha/block_1": 0.003949091769754887, "attnres/block_norm/1": 50223.265625, "attnres/final_alpha/block_2": 0.00866013951599598, "attnres/block_norm/2": 29731.03125, "attnres/final_alpha/block_3": 0.010584209114313126, "attnres/block_norm/3": 70558.9375, "attnres/final_alpha/block_4": 0.011962849646806717, "attnres/block_norm/4": 16985.37109375, "attnres/final_alpha/block_5": 0.6113595366477966, "attnres/block_norm/5": 7105.9482421875, "attnres/final_alpha/block_6": 0.10045003890991211, "attnres/block_norm/6": 47461.6484375, "geo/tier1_time_s": 1.3583025932312012, "geo/step": 8025.0, "geo/rankme_slope": 0.00045340370523209284} {"step": 8030, "timestamp": 1778334397.5598476, "train/loss": 2.377724766731262, "train/z_loss": 0.0013421010109595955, "train/perplexity": 10.780347135068494, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701708.3826587845, "perf/iters_per_sec": 0.8114377892774508, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2323803663253785, "data/tokens_consumed": 16842227712, "data/tokens_consumed_B": 16.842227712, "train/loss_slope": -1.6944638036801411e-06} {"step": 8040, "timestamp": 1778334407.9035351, "train/loss": 2.3405361652374266, "train/z_loss": 0.001349117502104491, "train/perplexity": 10.38680411332845, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028397.4536381988, "perf/iters_per_sec": 0.9672152774992937, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033895993232727, "data/tokens_consumed": 16863199232, "data/tokens_consumed_B": 16.863199232, "train/loss_slope": -4.059099256902894e-06} {"step": 8050, "timestamp": 1778334418.259792, "grad/layer_0/attn": 0.0033995152916759253, "grad/layer_0/mlp": 0.0034117738250643015, "grad/layer_0/attn_mlp_ratio": 0.9964069619917313, "grad/layer_4/attn": 0.0031007721554487944, "grad/layer_4/mlp": 0.0027136553544551134, "grad/layer_4/attn_mlp_ratio": 1.1426550671192013, "grad/layer_8/attn": 0.005061397794634104, "grad/layer_8/mlp": 0.003308526473119855, "grad/layer_8/attn_mlp_ratio": 1.5298041840605006, "grad/layer_12/attn": 0.006954862270504236, "grad/layer_12/mlp": 0.0068483976647257805, "grad/layer_12/attn_mlp_ratio": 1.0155458998492795, "grad/layer_16/attn": 0.006633996032178402, "grad/layer_16/mlp": 0.004533007740974426, "grad/layer_16/attn_mlp_ratio": 1.46348655570654, "grad/layer_20/attn": 0.004765090998262167, "grad/layer_20/mlp": 0.005166127346456051, "grad/layer_20/attn_mlp_ratio": 0.9223719406169522, "grad/layer_24/attn": 0.00875815562903881, "grad/layer_24/mlp": 0.010600920766592026, "grad/layer_24/attn_mlp_ratio": 0.8261693242744085, "grad/layer_27/attn": 0.0045820437371730804, "grad/layer_27/mlp": 0.010919363237917423, "grad/layer_27/attn_mlp_ratio": 0.41962554000396357} {"step": 8050, "timestamp": 1778334418.2755036, "train/loss": 2.3497076988220216, "train/z_loss": 0.0013605386135168373, "train/perplexity": 10.482505228245047, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023283.1567502266, "perf/iters_per_sec": 0.964776590705026, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036509394645691, "data/tokens_consumed": 16884170752, "data/tokens_consumed_B": 16.884170752, "train/loss_slope": -5.40408544486037e-06} {"step": 8060, "timestamp": 1778334428.6296446, "train/loss": 2.3066680908203123, "train/z_loss": 0.0013574584503658117, "train/perplexity": 10.040913446180024, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026658.8583082296, "perf/iters_per_sec": 0.966386250642886, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347829341888428, "data/tokens_consumed": 16905142272, "data/tokens_consumed_B": 16.905142272, "train/loss_slope": -7.135281718746493e-06} {"step": 8070, "timestamp": 1778334438.971664, "train/loss": 2.2891085624694822, "train/z_loss": 0.0013693682383745908, "train/perplexity": 9.866138713560325, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028721.3773258002, "perf/iters_per_sec": 0.9673697363499643, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337309122085572, "data/tokens_consumed": 16926113792, "data/tokens_consumed_B": 16.926113792, "train/loss_slope": -1.2049362518534859e-05} {"step": 8080, "timestamp": 1778334449.3291354, "train/loss": 2.3264118671417235, "train/z_loss": 0.0013528009876608849, "train/perplexity": 10.241128996944324, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025935.6223172643, "perf/iters_per_sec": 0.9660413848482439, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351523399353026, "data/tokens_consumed": 16947085312, "data/tokens_consumed_B": 16.947085312, "train/loss_slope": -1.3468438585420422e-05} {"step": 8090, "timestamp": 1778334459.6904333, "train/loss": 2.375041675567627, "train/z_loss": 0.0013505196664482355, "train/perplexity": 10.751461249990045, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025283.5003602002, "perf/iters_per_sec": 0.9657304288674355, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354856491088866, "data/tokens_consumed": 16968056832, "data/tokens_consumed_B": 16.968056832, "train/loss_slope": -8.607364337507943e-06} {"step": 8100, "timestamp": 1778334470.0337996, "grad/layer_0/attn": 0.0026785305235534906, "grad/layer_0/mlp": 0.003103801980614662, "grad/layer_0/attn_mlp_ratio": 0.862983674211291, "grad/layer_4/attn": 0.0018956182757392526, "grad/layer_4/mlp": 0.002620493760332465, "grad/layer_4/attn_mlp_ratio": 0.723382071003499, "grad/layer_8/attn": 0.004479499999433756, "grad/layer_8/mlp": 0.003517232835292816, "grad/layer_8/attn_mlp_ratio": 1.2735863907349738, "grad/layer_12/attn": 0.006875169463455677, "grad/layer_12/mlp": 0.006472542881965637, "grad/layer_12/attn_mlp_ratio": 1.0622053005459946, "grad/layer_16/attn": 0.0033248234540224075, "grad/layer_16/mlp": 0.004527078475803137, "grad/layer_16/attn_mlp_ratio": 0.7344302508450629, "grad/layer_20/attn": 0.0030389628373086452, "grad/layer_20/mlp": 0.00584873603656888, "grad/layer_20/attn_mlp_ratio": 0.5195930823939395, "grad/layer_24/attn": 0.007115121465176344, "grad/layer_24/mlp": 0.009158061817288399, "grad/layer_24/attn_mlp_ratio": 0.7769243677796681, "grad/layer_27/attn": 0.0041664293967187405, "grad/layer_27/mlp": 0.009327158331871033, "grad/layer_27/attn_mlp_ratio": 0.4466986839723867} {"step": 8100, "timestamp": 1778334470.6434276, "eos/sharpness": 30.462670326232903, "eos/L0_probe": 2.3269529342651367, "eos/L_plus": 2.4777541160583496, "eos/L_minus": 2.480778455734253, "eos/grad_norm": 0.10978473722934723, "eos/embed_grad_frac": 0.1984144002199173, "eos/time_s": 0.6067230701446533} {"step": 8100, "timestamp": 1778334470.665093, "train/loss": 2.366026830673218, "train/z_loss": 0.0013402944430708885, "train/perplexity": 10.654974056413288, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912118.5234453445, "perf/iters_per_sec": 0.9117691628672335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967688322067262, "data/tokens_consumed": 16989028352, "data/tokens_consumed_B": 16.989028352, "train/loss_slope": -7.570946570670226e-06} {"step": 8100, "timestamp": 1778334472.0329027, "geo/rankme_last": 429.061279296875, "geo/layer_0/stable_rank_q_proj": 20.7533016204834, "geo/layer_0/stable_rank_k_proj": 17.193241119384766, "geo/layer_0/stable_rank_o_proj": 44.67763900756836, "geo/layer_0/stable_rank_gate_proj": 127.85250091552734, "geo/layer_0/stable_rank_down_proj": 56.969482421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06332578510046005, "geo/layer_0/attn_entropy_mean": 6.246212959289551, "geo/layer_0/attn_entropy_std": 0.4325425624847412, "geo/layer_7/stable_rank_q_proj": 42.13533401489258, "geo/layer_7/stable_rank_k_proj": 38.75970458984375, "geo/layer_7/stable_rank_o_proj": 89.36045837402344, "geo/layer_7/stable_rank_gate_proj": 79.18480682373047, "geo/layer_7/stable_rank_down_proj": 144.55902099609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41733190417289734, "geo/layer_7/attn_entropy_mean": 4.742625713348389, "geo/layer_7/attn_entropy_std": 0.7602581977844238, "geo/layer_14/stable_rank_q_proj": 51.393619537353516, "geo/layer_14/stable_rank_k_proj": 42.281837463378906, "geo/layer_14/stable_rank_o_proj": 42.401615142822266, "geo/layer_14/stable_rank_gate_proj": 72.01471710205078, "geo/layer_14/stable_rank_down_proj": 127.31752014160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3790034055709839, "geo/layer_14/attn_entropy_mean": 5.504728317260742, "geo/layer_14/attn_entropy_std": 0.46697887778282166, "geo/layer_21/stable_rank_q_proj": 39.266021728515625, "geo/layer_21/stable_rank_k_proj": 28.780317306518555, "geo/layer_21/stable_rank_o_proj": 65.54186248779297, "geo/layer_21/stable_rank_gate_proj": 61.70737075805664, "geo/layer_21/stable_rank_down_proj": 49.835227966308594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13198918104171753, "geo/layer_21/attn_entropy_mean": 5.876852035522461, "geo/layer_21/attn_entropy_std": 0.32379150390625, "geo/layer_27/stable_rank_q_proj": 44.4627571105957, "geo/layer_27/stable_rank_k_proj": 30.210737228393555, "geo/layer_27/stable_rank_o_proj": 107.85620880126953, "geo/layer_27/stable_rank_gate_proj": 71.58432006835938, "geo/layer_27/stable_rank_down_proj": 128.806884765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09570418298244476, "geo/layer_27/attn_entropy_mean": 4.333096504211426, "geo/layer_27/attn_entropy_std": 0.687049925327301, "attnres/final_alpha/block_0": 0.25493699312210083, "attnres/block_norm/0": 1.7781507968902588, "attnres/final_alpha/block_1": 0.003965772222727537, "attnres/block_norm/1": 50286.3359375, "attnres/final_alpha/block_2": 0.008821146562695503, "attnres/block_norm/2": 29798.87890625, "attnres/final_alpha/block_3": 0.010865634307265282, "attnres/block_norm/3": 70302.03125, "attnres/final_alpha/block_4": 0.012215696275234222, "attnres/block_norm/4": 17136.798828125, "attnres/final_alpha/block_5": 0.6082888245582581, "attnres/block_norm/5": 7130.984375, "attnres/final_alpha/block_6": 0.10090593248605728, "attnres/block_norm/6": 47078.3515625, "geo/tier1_time_s": 1.363509178161621, "geo/step": 8100.0, "geo/rankme_slope": 0.0004730625844087635} {"step": 8110, "timestamp": 1778334482.381889, "train/loss": 2.3686971426010133, "train/z_loss": 0.0013561458792537451, "train/perplexity": 10.683464182559089, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790494.3509967315, "perf/iters_per_sec": 0.8537742381080301, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1712698221206665, "data/tokens_consumed": 17009999872, "data/tokens_consumed_B": 17.009999872, "train/loss_slope": -5.01047884634652e-06} {"step": 8120, "timestamp": 1778334493.301371, "train/loss": 2.3382919073104858, "train/z_loss": 0.0013550918665714562, "train/perplexity": 10.363519583879512, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921727.9109823797, "perf/iters_per_sec": 0.9163512759124659, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0912845611572266, "data/tokens_consumed": 17030971392, "data/tokens_consumed_B": 17.030971392, "train/loss_slope": -5.456838642123779e-06} {"step": 8130, "timestamp": 1778334503.6609454, "train/loss": 2.3157979249954224, "train/z_loss": 0.0013530547264963388, "train/perplexity": 10.133005071864076, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025726.4124221131, "perf/iters_per_sec": 0.965941625796372, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352592468261719, "data/tokens_consumed": 17051942912, "data/tokens_consumed_B": 17.051942912, "train/loss_slope": -5.765426777663096e-06} {"step": 8140, "timestamp": 1778334514.019059, "train/loss": 2.372585153579712, "train/z_loss": 0.0013555626501329243, "train/perplexity": 10.725082462328412, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026142.0284761162, "perf/iters_per_sec": 0.9661398069744664, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035046887397766, "data/tokens_consumed": 17072914432, "data/tokens_consumed_B": 17.072914432, "train/loss_slope": -4.607290968392785e-06} {"step": 8150, "timestamp": 1778334524.3572345, "grad/layer_0/attn": 0.00292556406930089, "grad/layer_0/mlp": 0.003187372349202633, "grad/layer_0/attn_mlp_ratio": 0.9178607507989102, "grad/layer_4/attn": 0.0027721491642296314, "grad/layer_4/mlp": 0.002787495031952858, "grad/layer_4/attn_mlp_ratio": 0.994494710484938, "grad/layer_8/attn": 0.004700671415776014, "grad/layer_8/mlp": 0.003617093665525317, "grad/layer_8/attn_mlp_ratio": 1.2995713466369947, "grad/layer_12/attn": 0.005394814535975456, "grad/layer_12/mlp": 0.006724240258336067, "grad/layer_12/attn_mlp_ratio": 0.8022935303446558, "grad/layer_16/attn": 0.0034327248577028513, "grad/layer_16/mlp": 0.004182297736406326, "grad/layer_16/attn_mlp_ratio": 0.8207748448284709, "grad/layer_20/attn": 0.002502014394849539, "grad/layer_20/mlp": 0.005362323950976133, "grad/layer_20/attn_mlp_ratio": 0.4665914202619074, "grad/layer_24/attn": 0.006988844368606806, "grad/layer_24/mlp": 0.007229197770357132, "grad/layer_24/attn_mlp_ratio": 0.9667523968688311, "grad/layer_27/attn": 0.004450176376849413, "grad/layer_27/mlp": 0.006166172679513693, "grad/layer_27/attn_mlp_ratio": 0.7217080247304366} {"step": 8150, "timestamp": 1778334524.372689, "train/loss": 2.3460342407226564, "train/z_loss": 0.001345630525611341, "train/perplexity": 10.444068824993797, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026726.5217950405, "perf/iters_per_sec": 0.966418515107651, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034748387336731, "data/tokens_consumed": 17093885952, "data/tokens_consumed_B": 17.093885952, "train/loss_slope": -3.816589191801837e-06} {"step": 8160, "timestamp": 1778334534.7156456, "train/loss": 2.344672155380249, "train/z_loss": 0.0013549887691624463, "train/perplexity": 10.429852795852531, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028881.0380878253, "perf/iters_per_sec": 0.9674458685340048, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336495637893677, "data/tokens_consumed": 17114857472, "data/tokens_consumed_B": 17.114857472, "train/loss_slope": -3.110022849590203e-06} {"step": 8170, "timestamp": 1778334545.060854, "train/loss": 2.4131746768951414, "train/z_loss": 0.0013477528817020356, "train/perplexity": 11.169364040907453, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028000.2690617905, "perf/iters_per_sec": 0.9670258851345971, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034098482131958, "data/tokens_consumed": 17135828992, "data/tokens_consumed_B": 17.135828992, "train/loss_slope": 7.73403503642337e-07} {"step": 8175, "timestamp": 1778334550.815146, "eos/sharpness": 21.22225761413574, "eos/L0_probe": 2.332305431365967, "eos/L_plus": 2.464569568634033, "eos/L_minus": 2.412263870239258, "eos/grad_norm": 0.10696500539779663, "eos/embed_grad_frac": 0.22912560403347015, "eos/time_s": 0.5911884307861328} {"step": 8175, "timestamp": 1778334552.1953616, "geo/rankme_last": 427.9627685546875, "geo/layer_0/stable_rank_q_proj": 20.778841018676758, "geo/layer_0/stable_rank_k_proj": 17.18793296813965, "geo/layer_0/stable_rank_o_proj": 44.713111877441406, "geo/layer_0/stable_rank_gate_proj": 127.54097747802734, "geo/layer_0/stable_rank_down_proj": 56.98619842529297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06354033201932907, "geo/layer_0/attn_entropy_mean": 6.2456889152526855, "geo/layer_0/attn_entropy_std": 0.4365195333957672, "geo/layer_7/stable_rank_q_proj": 42.113853454589844, "geo/layer_7/stable_rank_k_proj": 38.76803970336914, "geo/layer_7/stable_rank_o_proj": 89.21255493164062, "geo/layer_7/stable_rank_gate_proj": 79.20233154296875, "geo/layer_7/stable_rank_down_proj": 144.46376037597656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40613535046577454, "geo/layer_7/attn_entropy_mean": 4.721758842468262, "geo/layer_7/attn_entropy_std": 0.7666341066360474, "geo/layer_14/stable_rank_q_proj": 51.392181396484375, "geo/layer_14/stable_rank_k_proj": 42.31898880004883, "geo/layer_14/stable_rank_o_proj": 42.40216827392578, "geo/layer_14/stable_rank_gate_proj": 71.97492980957031, "geo/layer_14/stable_rank_down_proj": 127.37556457519531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37511083483695984, "geo/layer_14/attn_entropy_mean": 5.49495792388916, "geo/layer_14/attn_entropy_std": 0.4590744376182556, "geo/layer_21/stable_rank_q_proj": 39.23183822631836, "geo/layer_21/stable_rank_k_proj": 28.669448852539062, "geo/layer_21/stable_rank_o_proj": 65.56210327148438, "geo/layer_21/stable_rank_gate_proj": 61.708377838134766, "geo/layer_21/stable_rank_down_proj": 49.82011413574219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13622663915157318, "geo/layer_21/attn_entropy_mean": 5.863507270812988, "geo/layer_21/attn_entropy_std": 0.3331572413444519, "geo/layer_27/stable_rank_q_proj": 44.431453704833984, "geo/layer_27/stable_rank_k_proj": 30.145639419555664, "geo/layer_27/stable_rank_o_proj": 107.80802154541016, "geo/layer_27/stable_rank_gate_proj": 71.54635620117188, "geo/layer_27/stable_rank_down_proj": 128.72682189941406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10223585367202759, "geo/layer_27/attn_entropy_mean": 4.323141098022461, "geo/layer_27/attn_entropy_std": 0.6824129223823547, "attnres/final_alpha/block_0": 0.25363561511039734, "attnres/block_norm/0": 1.7782907485961914, "attnres/final_alpha/block_1": 0.003958388231694698, "attnres/block_norm/1": 50241.5625, "attnres/final_alpha/block_2": 0.008575666695833206, "attnres/block_norm/2": 29824.134765625, "attnres/final_alpha/block_3": 0.010440044105052948, "attnres/block_norm/3": 70941.71875, "attnres/final_alpha/block_4": 0.011959724128246307, "attnres/block_norm/4": 17147.46875, "attnres/final_alpha/block_5": 0.6091579794883728, "attnres/block_norm/5": 7105.8037109375, "attnres/final_alpha/block_6": 0.10227258503437042, "attnres/block_norm/6": 47354.16796875, "geo/tier1_time_s": 1.3610339164733887, "geo/step": 8175.0, "geo/rankme_slope": 0.0004767734632915666} {"step": 8180, "timestamp": 1778334557.3744624, "train/loss": 2.3594503879547117, "train/z_loss": 0.0013369363034144044, "train/perplexity": 10.585132137273801, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704106.7690659459, "perf/iters_per_sec": 0.8125814290361146, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2306458950042725, "data/tokens_consumed": 17156800512, "data/tokens_consumed_B": 17.156800512, "train/loss_slope": 6.416484038464579e-07} {"step": 8190, "timestamp": 1778334567.7194412, "train/loss": 2.3509240627288817, "train/z_loss": 0.001345861831214279, "train/perplexity": 10.495263527052211, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028184.6017241876, "perf/iters_per_sec": 0.9671137817974985, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340044975280762, "data/tokens_consumed": 17177772032, "data/tokens_consumed_B": 17.177772032, "train/loss_slope": 3.670674117163006e-06} {"step": 8200, "timestamp": 1778334578.0584862, "grad/layer_0/attn": 0.00342166586779058, "grad/layer_0/mlp": 0.003386683762073517, "grad/layer_0/attn_mlp_ratio": 1.0103292799510506, "grad/layer_4/attn": 0.004310589283704758, "grad/layer_4/mlp": 0.0026000516954809427, "grad/layer_4/attn_mlp_ratio": 1.6578859279637579, "grad/layer_8/attn": 0.005009878892451525, "grad/layer_8/mlp": 0.0035704723559319973, "grad/layer_8/attn_mlp_ratio": 1.4031417282404999, "grad/layer_12/attn": 0.007377246860414743, "grad/layer_12/mlp": 0.006942171603441238, "grad/layer_12/attn_mlp_ratio": 1.0626713333462838, "grad/layer_16/attn": 0.008505539037287235, "grad/layer_16/mlp": 0.005194263067096472, "grad/layer_16/attn_mlp_ratio": 1.6374871206307642, "grad/layer_20/attn": 0.0034054231364279985, "grad/layer_20/mlp": 0.006792642176151276, "grad/layer_20/attn_mlp_ratio": 0.5013399790512033, "grad/layer_24/attn": 0.012804735451936722, "grad/layer_24/mlp": 0.012669937685132027, "grad/layer_24/attn_mlp_ratio": 1.0106391735374485, "grad/layer_27/attn": 0.005840640515089035, "grad/layer_27/mlp": 0.013650300912559032, "grad/layer_27/attn_mlp_ratio": 0.42787631640616003} {"step": 8200, "timestamp": 1778334578.0740309, "train/loss": 2.3582850217819216, "train/z_loss": 0.0013529602205380798, "train/perplexity": 10.572803767274431, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026683.3735423891, "perf/iters_per_sec": 0.9663979404174753, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03477041721344, "data/tokens_consumed": 17198743552, "data/tokens_consumed_B": 17.198743552, "train/loss_slope": 2.069893535679269e-06} {"step": 8210, "timestamp": 1778334588.4237232, "train/loss": 2.393958806991577, "train/z_loss": 0.00135963266948238, "train/perplexity": 10.956783991101345, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027074.6235359404, "perf/iters_per_sec": 0.9665845029525473, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345706939697266, "data/tokens_consumed": 17219715072, "data/tokens_consumed_B": 17.219715072, "train/loss_slope": 5.044429237120816e-06} {"step": 8220, "timestamp": 1778334598.7702255, "train/loss": 2.3445207834243775, "train/z_loss": 0.0013567239278927446, "train/perplexity": 10.428274128121394, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028208.5926855353, "perf/iters_per_sec": 0.9671252215793301, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339922666549684, "data/tokens_consumed": 17240686592, "data/tokens_consumed_B": 17.240686592, "train/loss_slope": 5.04990674123873e-06} {"step": 8230, "timestamp": 1778334609.115592, "train/loss": 2.3637555837631226, "train/z_loss": 0.0013397135073319078, "train/perplexity": 10.630801440890997, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028411.9540754496, "perf/iters_per_sec": 0.9672221918465851, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338886022567748, "data/tokens_consumed": 17261658112, "data/tokens_consumed_B": 17.261658112, "train/loss_slope": 5.831830021571822e-06} {"step": 8240, "timestamp": 1778334619.487442, "train/loss": 2.3363911867141725, "train/z_loss": 0.0013406727812252939, "train/perplexity": 10.343840137245632, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023216.4210310248, "perf/iters_per_sec": 0.9647447686343311, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365435838699342, "data/tokens_consumed": 17282629632, "data/tokens_consumed_B": 17.282629632, "train/loss_slope": 4.506091702424349e-06} {"step": 8250, "timestamp": 1778334629.8546894, "grad/layer_0/attn": 0.0032055398914963007, "grad/layer_0/mlp": 0.003409921657294035, "grad/layer_0/attn_mlp_ratio": 0.940062593705985, "grad/layer_4/attn": 0.003238048404455185, "grad/layer_4/mlp": 0.002618112601339817, "grad/layer_4/attn_mlp_ratio": 1.2367872486154305, "grad/layer_8/attn": 0.00396373774856329, "grad/layer_8/mlp": 0.0035203341394662857, "grad/layer_8/attn_mlp_ratio": 1.125954946017918, "grad/layer_12/attn": 0.0043264999985694885, "grad/layer_12/mlp": 0.006688945926725864, "grad/layer_12/attn_mlp_ratio": 0.6468134114526924, "grad/layer_16/attn": 0.004491708241403103, "grad/layer_16/mlp": 0.0046775490045547485, "grad/layer_16/attn_mlp_ratio": 0.9602696072242869, "grad/layer_20/attn": 0.0029265903867781162, "grad/layer_20/mlp": 0.006280260626226664, "grad/layer_20/attn_mlp_ratio": 0.46599823070347024, "grad/layer_24/attn": 0.01040180865675211, "grad/layer_24/mlp": 0.010060569271445274, "grad/layer_24/attn_mlp_ratio": 1.0339184863905784, "grad/layer_27/attn": 0.007831394672393799, "grad/layer_27/mlp": 0.010311967693269253, "grad/layer_27/attn_mlp_ratio": 0.7594471617245979} {"step": 8250, "timestamp": 1778334630.4615643, "eos/sharpness": 56.96768760681151, "eos/L0_probe": 2.331738233566284, "eos/L_plus": 2.656710624694824, "eos/L_minus": 2.5764427185058594, "eos/grad_norm": 0.16660434007644653, "eos/embed_grad_frac": 0.08986346423625946, "eos/time_s": 0.6038458347320557} {"step": 8250, "timestamp": 1778334630.4823933, "train/loss": 2.3630661725997926, "train/z_loss": 0.0013493737205863, "train/perplexity": 10.623474973466898, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908501.6409681626, "perf/iters_per_sec": 0.9100444989052594, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0988473653793336, "data/tokens_consumed": 17303601152, "data/tokens_consumed_B": 17.303601152, "train/loss_slope": 3.4568044111387615e-06} {"step": 8250, "timestamp": 1778334631.8412685, "geo/rankme_last": 427.80059814453125, "geo/layer_0/stable_rank_q_proj": 20.810327529907227, "geo/layer_0/stable_rank_k_proj": 17.240327835083008, "geo/layer_0/stable_rank_o_proj": 44.81263732910156, "geo/layer_0/stable_rank_gate_proj": 127.56136322021484, "geo/layer_0/stable_rank_down_proj": 56.95280838012695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07072442770004272, "geo/layer_0/attn_entropy_mean": 6.244002819061279, "geo/layer_0/attn_entropy_std": 0.43670085072517395, "geo/layer_7/stable_rank_q_proj": 42.14641571044922, "geo/layer_7/stable_rank_k_proj": 38.70684814453125, "geo/layer_7/stable_rank_o_proj": 89.11641693115234, "geo/layer_7/stable_rank_gate_proj": 79.27056884765625, "geo/layer_7/stable_rank_down_proj": 144.63638305664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40437713265419006, "geo/layer_7/attn_entropy_mean": 4.730846881866455, "geo/layer_7/attn_entropy_std": 0.7909890413284302, "geo/layer_14/stable_rank_q_proj": 51.43239974975586, "geo/layer_14/stable_rank_k_proj": 42.354339599609375, "geo/layer_14/stable_rank_o_proj": 42.37715530395508, "geo/layer_14/stable_rank_gate_proj": 71.93631744384766, "geo/layer_14/stable_rank_down_proj": 127.32899475097656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3770962655544281, "geo/layer_14/attn_entropy_mean": 5.5151166915893555, "geo/layer_14/attn_entropy_std": 0.4551367461681366, "geo/layer_21/stable_rank_q_proj": 39.272186279296875, "geo/layer_21/stable_rank_k_proj": 28.729745864868164, "geo/layer_21/stable_rank_o_proj": 65.61907196044922, "geo/layer_21/stable_rank_gate_proj": 61.547760009765625, "geo/layer_21/stable_rank_down_proj": 49.89060592651367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14025261998176575, "geo/layer_21/attn_entropy_mean": 5.861644268035889, "geo/layer_21/attn_entropy_std": 0.31837958097457886, "geo/layer_27/stable_rank_q_proj": 44.40714645385742, "geo/layer_27/stable_rank_k_proj": 30.09398078918457, "geo/layer_27/stable_rank_o_proj": 107.60006713867188, "geo/layer_27/stable_rank_gate_proj": 71.43817901611328, "geo/layer_27/stable_rank_down_proj": 128.7678985595703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1050577312707901, "geo/layer_27/attn_entropy_mean": 4.322010517120361, "geo/layer_27/attn_entropy_std": 0.6686595678329468, "attnres/final_alpha/block_0": 0.2519441545009613, "attnres/block_norm/0": 1.778032898902893, "attnres/final_alpha/block_1": 0.0038983356207609177, "attnres/block_norm/1": 50479.3359375, "attnres/final_alpha/block_2": 0.008539969101548195, "attnres/block_norm/2": 29683.576171875, "attnres/final_alpha/block_3": 0.010443480685353279, "attnres/block_norm/3": 70580.8203125, "attnres/final_alpha/block_4": 0.011907713487744331, "attnres/block_norm/4": 17169.603515625, "attnres/final_alpha/block_5": 0.6135224103927612, "attnres/block_norm/5": 7101.072265625, "attnres/final_alpha/block_6": 0.09974395483732224, "attnres/block_norm/6": 47593.3125, "geo/tier1_time_s": 1.3553242683410645, "geo/step": 8250.0, "geo/rankme_slope": 0.00042803058723489394} {"step": 8260, "timestamp": 1778334642.213865, "train/loss": 2.350934314727783, "train/z_loss": 0.0013574841199442745, "train/perplexity": 10.495371125033907, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788157.6107669694, "perf/iters_per_sec": 0.8526599935374114, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1728004217147827, "data/tokens_consumed": 17324572672, "data/tokens_consumed_B": 17.324572672, "train/loss_slope": 5.715362152250695e-06} {"step": 8270, "timestamp": 1778334652.587695, "train/loss": 2.3408188819885254, "train/z_loss": 0.0013633268885314465, "train/perplexity": 10.389741051982986, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022919.7935521333, "perf/iters_per_sec": 0.9646033256302515, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366955757141114, "data/tokens_consumed": 17345544192, "data/tokens_consumed_B": 17.345544192, "train/loss_slope": 4.668912721617324e-06} {"step": 8280, "timestamp": 1778334662.9360723, "train/loss": 2.30527868270874, "train/z_loss": 0.0013655083603225648, "train/perplexity": 10.026972206868546, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027432.3779066328, "perf/iters_per_sec": 0.966755093530003, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343881368637085, "data/tokens_consumed": 17366515712, "data/tokens_consumed_B": 17.366515712, "train/loss_slope": 1.9375424538150793e-06} {"step": 8290, "timestamp": 1778334673.2922268, "train/loss": 2.3127952814102173, "train/z_loss": 0.0013436636771075427, "train/perplexity": 10.102624902421939, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.4186233184, "perf/iters_per_sec": 0.9662305920712082, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349496364593507, "data/tokens_consumed": 17387487232, "data/tokens_consumed_B": 17.387487232, "train/loss_slope": -6.13177937857568e-06} {"step": 8300, "timestamp": 1778334683.6282692, "grad/layer_0/attn": 0.0028004115447402, "grad/layer_0/mlp": 0.0031731834169477224, "grad/layer_0/attn_mlp_ratio": 0.8825242945399858, "grad/layer_4/attn": 0.0019077991601079702, "grad/layer_4/mlp": 0.0025231419131159782, "grad/layer_4/attn_mlp_ratio": 0.7561204047139287, "grad/layer_8/attn": 0.0072868093848228455, "grad/layer_8/mlp": 0.0032497900538146496, "grad/layer_8/attn_mlp_ratio": 2.242239972408522, "grad/layer_12/attn": 0.006748335435986519, "grad/layer_12/mlp": 0.006883831229060888, "grad/layer_12/attn_mlp_ratio": 0.9803167906653439, "grad/layer_16/attn": 0.0044398619793355465, "grad/layer_16/mlp": 0.004340142942965031, "grad/layer_16/attn_mlp_ratio": 1.0229759561800966, "grad/layer_20/attn": 0.0038281481247395277, "grad/layer_20/mlp": 0.006396796554327011, "grad/layer_20/attn_mlp_ratio": 0.5984476811764898, "grad/layer_24/attn": 0.011453834362328053, "grad/layer_24/mlp": 0.012014176696538925, "grad/layer_24/attn_mlp_ratio": 0.9533598977524371, "grad/layer_27/attn": 0.008054226636886597, "grad/layer_27/mlp": 0.012135655619204044, "grad/layer_27/attn_mlp_ratio": 0.6636828551539413} {"step": 8300, "timestamp": 1778334683.644035, "train/loss": 2.3874821424484254, "train/z_loss": 0.0013497785781510173, "train/perplexity": 10.886049884517133, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027177.0264107306, "perf/iters_per_sec": 0.966633332448354, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345184326171875, "data/tokens_consumed": 17408458752, "data/tokens_consumed_B": 17.408458752, "train/loss_slope": -2.7867216338562092e-06} {"step": 8310, "timestamp": 1778334693.9969654, "train/loss": 2.3736027002334597, "train/z_loss": 0.001342415704857558, "train/perplexity": 10.736001288364456, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026723.1595275316, "perf/iters_per_sec": 0.9664169118535669, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347501039505005, "data/tokens_consumed": 17429430272, "data/tokens_consumed_B": 17.429430272, "train/loss_slope": -2.9770545118247946e-06} {"step": 8320, "timestamp": 1778334704.3552203, "train/loss": 2.3760473012924193, "train/z_loss": 0.001347209047526121, "train/perplexity": 10.762278634210444, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025876.9234763256, "perf/iters_per_sec": 0.9660133950597408, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351823329925538, "data/tokens_consumed": 17450401792, "data/tokens_consumed_B": 17.450401792, "train/loss_slope": 1.1781191203529619e-07} {"step": 8325, "timestamp": 1778334710.125321, "eos/sharpness": 57.881116867065415, "eos/L0_probe": 2.3325798511505127, "eos/L_plus": 2.661978244781494, "eos/L_minus": 2.5819926261901855, "eos/grad_norm": 0.1923762857913971, "eos/embed_grad_frac": 0.07302907854318619, "eos/time_s": 0.6042256355285645} {"step": 8325, "timestamp": 1778334711.503969, "geo/rankme_last": 428.74468994140625, "geo/layer_0/stable_rank_q_proj": 20.811317443847656, "geo/layer_0/stable_rank_k_proj": 17.258333206176758, "geo/layer_0/stable_rank_o_proj": 44.723358154296875, "geo/layer_0/stable_rank_gate_proj": 127.74337768554688, "geo/layer_0/stable_rank_down_proj": 56.946659088134766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061980266124010086, "geo/layer_0/attn_entropy_mean": 6.243802547454834, "geo/layer_0/attn_entropy_std": 0.4337003827095032, "geo/layer_7/stable_rank_q_proj": 42.210479736328125, "geo/layer_7/stable_rank_k_proj": 38.65842819213867, "geo/layer_7/stable_rank_o_proj": 89.06046295166016, "geo/layer_7/stable_rank_gate_proj": 79.26367950439453, "geo/layer_7/stable_rank_down_proj": 144.61959838867188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40660592913627625, "geo/layer_7/attn_entropy_mean": 4.717440605163574, "geo/layer_7/attn_entropy_std": 0.7621675729751587, "geo/layer_14/stable_rank_q_proj": 51.42251205444336, "geo/layer_14/stable_rank_k_proj": 42.394371032714844, "geo/layer_14/stable_rank_o_proj": 42.43113708496094, "geo/layer_14/stable_rank_gate_proj": 71.9051513671875, "geo/layer_14/stable_rank_down_proj": 127.20921325683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3751152455806732, "geo/layer_14/attn_entropy_mean": 5.534158706665039, "geo/layer_14/attn_entropy_std": 0.47418808937072754, "geo/layer_21/stable_rank_q_proj": 39.311363220214844, "geo/layer_21/stable_rank_k_proj": 28.658178329467773, "geo/layer_21/stable_rank_o_proj": 65.63561248779297, "geo/layer_21/stable_rank_gate_proj": 61.472389221191406, "geo/layer_21/stable_rank_down_proj": 49.913631439208984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13572418689727783, "geo/layer_21/attn_entropy_mean": 5.863833904266357, "geo/layer_21/attn_entropy_std": 0.3178230822086334, "geo/layer_27/stable_rank_q_proj": 44.385494232177734, "geo/layer_27/stable_rank_k_proj": 30.11701011657715, "geo/layer_27/stable_rank_o_proj": 107.64424133300781, "geo/layer_27/stable_rank_gate_proj": 71.47565460205078, "geo/layer_27/stable_rank_down_proj": 129.23793029785156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10493555665016174, "geo/layer_27/attn_entropy_mean": 4.318246841430664, "geo/layer_27/attn_entropy_std": 0.6860498189926147, "attnres/final_alpha/block_0": 0.2541279196739197, "attnres/block_norm/0": 1.7781062126159668, "attnres/final_alpha/block_1": 0.003945138305425644, "attnres/block_norm/1": 50415.00390625, "attnres/final_alpha/block_2": 0.008671144023537636, "attnres/block_norm/2": 29828.93359375, "attnres/final_alpha/block_3": 0.010563505813479424, "attnres/block_norm/3": 70980.2890625, "attnres/final_alpha/block_4": 0.012301212176680565, "attnres/block_norm/4": 17167.46484375, "attnres/final_alpha/block_5": 0.6098790764808655, "attnres/block_norm/5": 7176.75390625, "attnres/final_alpha/block_6": 0.10051199793815613, "attnres/block_norm/6": 47404.515625, "geo/tier1_time_s": 1.3586552143096924, "geo/step": 8325.0, "geo/rankme_slope": 0.00042803639033738494} {"step": 8330, "timestamp": 1778334716.6817722, "train/loss": 2.3731573104858397, "train/z_loss": 0.0013600177480839193, "train/perplexity": 10.73122064816328, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702283.9803878677, "perf/iters_per_sec": 0.811712255662855, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2319636583328246, "data/tokens_consumed": 17471373312, "data/tokens_consumed_B": 17.471373312, "train/loss_slope": 2.605277684369364e-06} {"step": 8340, "timestamp": 1778334727.0387995, "train/loss": 2.325690984725952, "train/z_loss": 0.00137042774586007, "train/perplexity": 10.233749007504686, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025782.256598537, "perf/iters_per_sec": 0.9659682543747602, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352307081222534, "data/tokens_consumed": 17492344832, "data/tokens_consumed_B": 17.492344832, "train/loss_slope": -2.803480736028472e-07} {"step": 8350, "timestamp": 1778334737.3904626, "grad/layer_0/attn": 0.002944111591205001, "grad/layer_0/mlp": 0.0033061944413930178, "grad/layer_0/attn_mlp_ratio": 0.8904834710556813, "grad/layer_4/attn": 0.002144385129213333, "grad/layer_4/mlp": 0.002706632949411869, "grad/layer_4/attn_mlp_ratio": 0.7922703558501555, "grad/layer_8/attn": 0.007086481899023056, "grad/layer_8/mlp": 0.0034583748783916235, "grad/layer_8/attn_mlp_ratio": 2.049078524827503, "grad/layer_12/attn": 0.005103331059217453, "grad/layer_12/mlp": 0.0066216387785971165, "grad/layer_12/attn_mlp_ratio": 0.7707051309778861, "grad/layer_16/attn": 0.005103960167616606, "grad/layer_16/mlp": 0.004342308267951012, "grad/layer_16/attn_mlp_ratio": 1.1754025129323076, "grad/layer_20/attn": 0.0032324406784027815, "grad/layer_20/mlp": 0.0057121641002595425, "grad/layer_20/attn_mlp_ratio": 0.5658872128108484, "grad/layer_24/attn": 0.005015391390770674, "grad/layer_24/mlp": 0.00832906924188137, "grad/layer_24/attn_mlp_ratio": 0.6021550769846031, "grad/layer_27/attn": 0.0072932615876197815, "grad/layer_27/mlp": 0.0066749053075909615, "grad/layer_27/attn_mlp_ratio": 1.0926389427669794} {"step": 8350, "timestamp": 1778334737.4063811, "train/loss": 2.337498688697815, "train/z_loss": 0.0013630751287564634, "train/perplexity": 10.355302306732247, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024080.8822561388, "perf/iters_per_sec": 0.9651569758682913, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361008882522582, "data/tokens_consumed": 17513316352, "data/tokens_consumed_B": 17.513316352, "train/loss_slope": 6.367458368685051e-07} {"step": 8360, "timestamp": 1778334747.7499154, "train/loss": 2.406253480911255, "train/z_loss": 0.001344049023464322, "train/perplexity": 11.09232559000399, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028637.3455888547, "perf/iters_per_sec": 0.9673296668953203, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337737321853637, "data/tokens_consumed": 17534287872, "data/tokens_consumed_B": 17.534287872, "train/loss_slope": 6.4261592403746875e-06} {"step": 8370, "timestamp": 1778334758.1167524, "train/loss": 2.3644103527069094, "train/z_loss": 0.0013561381492763758, "train/perplexity": 10.6377644388507, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023723.099033648, "perf/iters_per_sec": 0.964986371533226, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362840652465821, "data/tokens_consumed": 17555259392, "data/tokens_consumed_B": 17.555259392, "train/loss_slope": 7.704780955161538e-06} {"step": 8380, "timestamp": 1778334768.4942532, "train/loss": 2.362807297706604, "train/z_loss": 0.0013364710146561265, "train/perplexity": 10.62072517845965, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022085.7944292447, "perf/iters_per_sec": 0.9642056438585495, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037123155593872, "data/tokens_consumed": 17576230912, "data/tokens_consumed_B": 17.576230912, "train/loss_slope": 7.503959905840097e-06} {"step": 8390, "timestamp": 1778334779.2812884, "train/loss": 2.3550952672958374, "train/z_loss": 0.0013469878351315856, "train/perplexity": 10.539132848558113, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945463.2915479692, "perf/iters_per_sec": 0.9276691873302313, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0779704809188844, "data/tokens_consumed": 17597202432, "data/tokens_consumed_B": 17.597202432, "train/loss_slope": 4.128987560964616e-06} {"step": 8400, "timestamp": 1778334789.6625612, "grad/layer_0/attn": 0.0031129580456763506, "grad/layer_0/mlp": 0.0031853613909333944, "grad/layer_0/attn_mlp_ratio": 0.9772699439410163, "grad/layer_4/attn": 0.0019009137758985162, "grad/layer_4/mlp": 0.0026666398625820875, "grad/layer_4/attn_mlp_ratio": 0.712849804462495, "grad/layer_8/attn": 0.0043072933331131935, "grad/layer_8/mlp": 0.00336873228661716, "grad/layer_8/attn_mlp_ratio": 1.2786095298708262, "grad/layer_12/attn": 0.005244456697255373, "grad/layer_12/mlp": 0.0071864258497953415, "grad/layer_12/attn_mlp_ratio": 0.7297725926480503, "grad/layer_16/attn": 0.004588177427649498, "grad/layer_16/mlp": 0.004745452664792538, "grad/layer_16/attn_mlp_ratio": 0.9668576751390517, "grad/layer_20/attn": 0.002772929845377803, "grad/layer_20/mlp": 0.0059807454235851765, "grad/layer_20/attn_mlp_ratio": 0.46364284092053487, "grad/layer_24/attn": 0.010613941587507725, "grad/layer_24/mlp": 0.010813331231474876, "grad/layer_24/attn_mlp_ratio": 0.9815607477607962, "grad/layer_27/attn": 0.007379109971225262, "grad/layer_27/mlp": 0.010769203305244446, "grad/layer_27/attn_mlp_ratio": 0.6852048098220285} {"step": 8400, "timestamp": 1778334790.2592027, "eos/sharpness": 50.651812553405755, "eos/L0_probe": 2.3337182998657227, "eos/L_plus": 2.608375310897827, "eos/L_minus": 2.565579414367676, "eos/grad_norm": 0.16874264180660248, "eos/embed_grad_frac": 0.0764201283454895, "eos/time_s": 0.5937554836273193} {"step": 8400, "timestamp": 1778334790.2786753, "train/loss": 2.336449646949768, "train/z_loss": 0.0013603047002106905, "train/perplexity": 10.344444858252912, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907896.3498784618, "perf/iters_per_sec": 0.9097558736221608, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0991959810256957, "data/tokens_consumed": 17618173952, "data/tokens_consumed_B": 17.618173952, "train/loss_slope": 2.706490541555942e-06} {"step": 8400, "timestamp": 1778334791.638579, "geo/rankme_last": 428.940185546875, "geo/layer_0/stable_rank_q_proj": 20.785606384277344, "geo/layer_0/stable_rank_k_proj": 17.2310848236084, "geo/layer_0/stable_rank_o_proj": 44.698795318603516, "geo/layer_0/stable_rank_gate_proj": 127.59398651123047, "geo/layer_0/stable_rank_down_proj": 56.94151306152344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06462017446756363, "geo/layer_0/attn_entropy_mean": 6.243646621704102, "geo/layer_0/attn_entropy_std": 0.43198493123054504, "geo/layer_7/stable_rank_q_proj": 42.192142486572266, "geo/layer_7/stable_rank_k_proj": 38.675079345703125, "geo/layer_7/stable_rank_o_proj": 88.98872375488281, "geo/layer_7/stable_rank_gate_proj": 79.11943817138672, "geo/layer_7/stable_rank_down_proj": 144.61561584472656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39947137236595154, "geo/layer_7/attn_entropy_mean": 4.717419624328613, "geo/layer_7/attn_entropy_std": 0.7550737261772156, "geo/layer_14/stable_rank_q_proj": 51.427642822265625, "geo/layer_14/stable_rank_k_proj": 42.407169342041016, "geo/layer_14/stable_rank_o_proj": 42.43107223510742, "geo/layer_14/stable_rank_gate_proj": 71.95777130126953, "geo/layer_14/stable_rank_down_proj": 127.14588928222656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36470264196395874, "geo/layer_14/attn_entropy_mean": 5.485041618347168, "geo/layer_14/attn_entropy_std": 0.4609094560146332, "geo/layer_21/stable_rank_q_proj": 39.33909225463867, "geo/layer_21/stable_rank_k_proj": 28.67401885986328, "geo/layer_21/stable_rank_o_proj": 65.6143798828125, "geo/layer_21/stable_rank_gate_proj": 61.442657470703125, "geo/layer_21/stable_rank_down_proj": 49.898704528808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13883797824382782, "geo/layer_21/attn_entropy_mean": 5.865417003631592, "geo/layer_21/attn_entropy_std": 0.3180324137210846, "geo/layer_27/stable_rank_q_proj": 44.405399322509766, "geo/layer_27/stable_rank_k_proj": 30.122467041015625, "geo/layer_27/stable_rank_o_proj": 107.56790924072266, "geo/layer_27/stable_rank_gate_proj": 71.50971984863281, "geo/layer_27/stable_rank_down_proj": 129.1973876953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10092148184776306, "geo/layer_27/attn_entropy_mean": 4.31207275390625, "geo/layer_27/attn_entropy_std": 0.6624860167503357, "attnres/final_alpha/block_0": 0.2539488971233368, "attnres/block_norm/0": 1.7779332399368286, "attnres/final_alpha/block_1": 0.0038764639757573605, "attnres/block_norm/1": 50429.5234375, "attnres/final_alpha/block_2": 0.008599640801548958, "attnres/block_norm/2": 29861.59375, "attnres/final_alpha/block_3": 0.010718109086155891, "attnres/block_norm/3": 70865.6171875, "attnres/final_alpha/block_4": 0.011891325935721397, "attnres/block_norm/4": 17149.390625, "attnres/final_alpha/block_5": 0.6122568249702454, "attnres/block_norm/5": 7087.97802734375, "attnres/final_alpha/block_6": 0.09870870411396027, "attnres/block_norm/6": 47431.62109375, "geo/tier1_time_s": 1.356130838394165, "geo/step": 8400.0, "geo/rankme_slope": 0.00041835153983468387} {"step": 8410, "timestamp": 1778334802.0198588, "train/loss": 2.331962323188782, "train/z_loss": 0.0013552952092140912, "train/perplexity": 10.2981299776949, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786715.041161754, "perf/iters_per_sec": 0.8519721227463503, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1737473249435424, "data/tokens_consumed": 17639145472, "data/tokens_consumed_B": 17.639145472, "train/loss_slope": 4.1998513568244955e-06} {"step": 8420, "timestamp": 1778334812.3965735, "train/loss": 2.3081746101379395, "train/z_loss": 0.0013508962816558777, "train/perplexity": 10.056051676408309, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022087.9327243103, "perf/iters_per_sec": 0.9642066634770919, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371220588684082, "data/tokens_consumed": 17660116992, "data/tokens_consumed_B": 17.660116992, "train/loss_slope": -2.822927337060744e-07} {"step": 8430, "timestamp": 1778334822.780965, "train/loss": 2.375116539001465, "train/z_loss": 0.0013442912604659796, "train/perplexity": 10.75226617142721, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020474.666914744, "perf/iters_per_sec": 0.9634373983930321, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379501581192017, "data/tokens_consumed": 17681088512, "data/tokens_consumed_B": 17.681088512, "train/loss_slope": 1.6050183948295795e-06} {"step": 8440, "timestamp": 1778334833.1613567, "train/loss": 2.316849684715271, "train/z_loss": 0.0013539870269596577, "train/perplexity": 10.143668164962634, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021779.0424380172, "perf/iters_per_sec": 0.9640593731107794, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037280511856079, "data/tokens_consumed": 17702060032, "data/tokens_consumed_B": 17.702060032, "train/loss_slope": -1.2409071288522459e-06} {"step": 8450, "timestamp": 1778334843.5268297, "grad/layer_0/attn": 0.00312570808455348, "grad/layer_0/mlp": 0.0034704715944826603, "grad/layer_0/attn_mlp_ratio": 0.9006579968719285, "grad/layer_4/attn": 0.002534307539463043, "grad/layer_4/mlp": 0.0026473025791347027, "grad/layer_4/attn_mlp_ratio": 0.9573168793420354, "grad/layer_8/attn": 0.004005108959972858, "grad/layer_8/mlp": 0.003532977541908622, "grad/layer_8/attn_mlp_ratio": 1.133635523888902, "grad/layer_12/attn": 0.006711478345096111, "grad/layer_12/mlp": 0.00783576536923647, "grad/layer_12/attn_mlp_ratio": 0.8565185330578928, "grad/layer_16/attn": 0.004482737276703119, "grad/layer_16/mlp": 0.005040970165282488, "grad/layer_16/attn_mlp_ratio": 0.8892608051223874, "grad/layer_20/attn": 0.004119627643376589, "grad/layer_20/mlp": 0.007103347219526768, "grad/layer_20/attn_mlp_ratio": 0.5799558233696284, "grad/layer_24/attn": 0.011274086311459541, "grad/layer_24/mlp": 0.009785152040421963, "grad/layer_24/attn_mlp_ratio": 1.15216259795153, "grad/layer_27/attn": 0.005403086077421904, "grad/layer_27/mlp": 0.00946781039237976, "grad/layer_27/attn_mlp_ratio": 0.5706795759981275} {"step": 8450, "timestamp": 1778334843.5424893, "train/loss": 2.307573699951172, "train/z_loss": 0.0013585055130533874, "train/perplexity": 10.05001070773888, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021125.970837455, "perf/iters_per_sec": 0.9637479643046641, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03761568069458, "data/tokens_consumed": 17723031552, "data/tokens_consumed_B": 17.723031552, "train/loss_slope": -4.415082874292397e-06} {"step": 8460, "timestamp": 1778334853.9240408, "train/loss": 2.3459649085998535, "train/z_loss": 0.0013381097582168878, "train/perplexity": 10.443344740632904, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021051.7151950991, "perf/iters_per_sec": 0.9637125564551826, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376538038253784, "data/tokens_consumed": 17744003072, "data/tokens_consumed_B": 17.744003072, "train/loss_slope": -4.590220355501157e-06} {"step": 8470, "timestamp": 1778334864.301274, "train/loss": 2.332106852531433, "train/z_loss": 0.001353174145333469, "train/perplexity": 10.29961846721373, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022175.4672496675, "perf/iters_per_sec": 0.9642484031914079, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370771646499635, "data/tokens_consumed": 17764974592, "data/tokens_consumed_B": 17.764974592, "train/loss_slope": -4.83796409826687e-06} {"step": 8475, "timestamp": 1778334870.0742288, "eos/sharpness": 54.34629917144775, "eos/L0_probe": 2.3322877883911133, "eos/L_plus": 2.602277994155884, "eos/L_minus": 2.6057605743408203, "eos/grad_norm": 0.22773456573486328, "eos/embed_grad_frac": 0.04707438126206398, "eos/time_s": 0.597559928894043} {"step": 8475, "timestamp": 1778334871.45263, "geo/rankme_last": 427.58868408203125, "geo/layer_0/stable_rank_q_proj": 20.77322006225586, "geo/layer_0/stable_rank_k_proj": 17.185789108276367, "geo/layer_0/stable_rank_o_proj": 44.726287841796875, "geo/layer_0/stable_rank_gate_proj": 127.55607604980469, "geo/layer_0/stable_rank_down_proj": 56.927215576171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060878999531269073, "geo/layer_0/attn_entropy_mean": 6.248314380645752, "geo/layer_0/attn_entropy_std": 0.4334571957588196, "geo/layer_7/stable_rank_q_proj": 42.22890853881836, "geo/layer_7/stable_rank_k_proj": 38.66510009765625, "geo/layer_7/stable_rank_o_proj": 89.10323333740234, "geo/layer_7/stable_rank_gate_proj": 79.13006591796875, "geo/layer_7/stable_rank_down_proj": 144.3173065185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39456307888031006, "geo/layer_7/attn_entropy_mean": 4.736626625061035, "geo/layer_7/attn_entropy_std": 0.7631762027740479, "geo/layer_14/stable_rank_q_proj": 51.41622543334961, "geo/layer_14/stable_rank_k_proj": 42.531646728515625, "geo/layer_14/stable_rank_o_proj": 42.432254791259766, "geo/layer_14/stable_rank_gate_proj": 72.02069091796875, "geo/layer_14/stable_rank_down_proj": 127.04326629638672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37360990047454834, "geo/layer_14/attn_entropy_mean": 5.493590354919434, "geo/layer_14/attn_entropy_std": 0.4385729730129242, "geo/layer_21/stable_rank_q_proj": 39.2886962890625, "geo/layer_21/stable_rank_k_proj": 28.651220321655273, "geo/layer_21/stable_rank_o_proj": 65.60494995117188, "geo/layer_21/stable_rank_gate_proj": 61.43035125732422, "geo/layer_21/stable_rank_down_proj": 49.87559127807617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13272163271903992, "geo/layer_21/attn_entropy_mean": 5.852398872375488, "geo/layer_21/attn_entropy_std": 0.32954296469688416, "geo/layer_27/stable_rank_q_proj": 44.42550277709961, "geo/layer_27/stable_rank_k_proj": 30.085193634033203, "geo/layer_27/stable_rank_o_proj": 107.78148651123047, "geo/layer_27/stable_rank_gate_proj": 71.44440460205078, "geo/layer_27/stable_rank_down_proj": 128.96728515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11342564225196838, "geo/layer_27/attn_entropy_mean": 4.324735641479492, "geo/layer_27/attn_entropy_std": 0.6740140318870544, "attnres/final_alpha/block_0": 0.2589975595474243, "attnres/block_norm/0": 1.7780513763427734, "attnres/final_alpha/block_1": 0.004066030029207468, "attnres/block_norm/1": 50332.1015625, "attnres/final_alpha/block_2": 0.008952626958489418, "attnres/block_norm/2": 29845.41796875, "attnres/final_alpha/block_3": 0.011078402400016785, "attnres/block_norm/3": 70592.7578125, "attnres/final_alpha/block_4": 0.012535357847809792, "attnres/block_norm/4": 17105.478515625, "attnres/final_alpha/block_5": 0.6012583374977112, "attnres/block_norm/5": 7163.857421875, "attnres/final_alpha/block_6": 0.10311168432235718, "attnres/block_norm/6": 47259.50390625, "geo/tier1_time_s": 1.3592936992645264, "geo/step": 8475.0, "geo/rankme_slope": 0.00037818758362720086} {"step": 8480, "timestamp": 1778334876.6500676, "train/loss": 2.3818742990493775, "train/z_loss": 0.001346637972164899, "train/perplexity": 10.82517347375589, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698979.7483809788, "perf/iters_per_sec": 0.8101366750626463, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.234359622001648, "data/tokens_consumed": 17785946112, "data/tokens_consumed_B": 17.785946112, "train/loss_slope": -4.166293966852474e-06} {"step": 8490, "timestamp": 1778334887.0364017, "train/loss": 2.30853853225708, "train/z_loss": 0.0013700090115889908, "train/perplexity": 10.05971196203363, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020350.247809094, "perf/iters_per_sec": 0.9633780707402677, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380140781402587, "data/tokens_consumed": 17806917632, "data/tokens_consumed_B": 17.806917632, "train/loss_slope": -4.458901562420383e-06} {"step": 8500, "timestamp": 1778334897.4113233, "grad/layer_0/attn": 0.0029661040753126144, "grad/layer_0/mlp": 0.003208385780453682, "grad/layer_0/attn_mlp_ratio": 0.9244848300146465, "grad/layer_4/attn": 0.002100911922752857, "grad/layer_4/mlp": 0.00262324046343565, "grad/layer_4/attn_mlp_ratio": 0.8008842010285551, "grad/layer_8/attn": 0.009432877413928509, "grad/layer_8/mlp": 0.003451145486906171, "grad/layer_8/attn_mlp_ratio": 2.73325977603418, "grad/layer_12/attn": 0.01006026566028595, "grad/layer_12/mlp": 0.007933373562991619, "grad/layer_12/attn_mlp_ratio": 1.2680942670349773, "grad/layer_16/attn": 0.004092106595635414, "grad/layer_16/mlp": 0.0048787593841552734, "grad/layer_16/attn_mlp_ratio": 0.8387596496456388, "grad/layer_20/attn": 0.007168211974203587, "grad/layer_20/mlp": 0.006679135840386152, "grad/layer_20/attn_mlp_ratio": 1.0732244467222445, "grad/layer_24/attn": 0.004633000586181879, "grad/layer_24/mlp": 0.007656267378479242, "grad/layer_24/attn_mlp_ratio": 0.6051252257323341, "grad/layer_27/attn": 0.0045676506124436855, "grad/layer_27/mlp": 0.006657721940428019, "grad/layer_27/attn_mlp_ratio": 0.686068085256085} {"step": 8500, "timestamp": 1778334897.4269102, "train/loss": 2.3344648361206053, "train/z_loss": 0.0013669052626937629, "train/perplexity": 10.323933454438459, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019782.968442442, "perf/iters_per_sec": 0.9631075708591661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383056163787843, "data/tokens_consumed": 17827889152, "data/tokens_consumed_B": 17.827889152, "train/loss_slope": -5.80127859416044e-06} {"step": 8500, "timestamp": 1778334904.481719, "geo/ww_alpha_mean": 7.9290143600600125, "geo/ww_alpha_std": 6.144716517895969, "geo/ww_alpha_min": 1.3528318864742637, "geo/ww_alpha_max": 69.58848562559103, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.868442838899579, "geo/ww_alpha_by_type/k_proj": 4.391399155159483, "geo/ww_alpha_by_type/v_proj": 11.08756767416474, "geo/ww_alpha_by_type/o_proj": 8.829402705725863, "geo/ww_alpha_by_type/gate_proj": 7.716447342442024, "geo/ww_alpha_by_type/up_proj": 11.648585540367094, "geo/ww_alpha_by_type/down_proj": 8.065736011638691, "geo/twonn_id/layer_0": 0.7586870193481445, "geo/twonn_id/layer_7": 3.71467924118042, "geo/twonn_id/layer_14": 5.950919151306152, "geo/twonn_id/layer_21": 7.654026508331299, "geo/twonn_id/layer_27": 6.457942485809326, "geo/tier2_time_s": 7.049068212509155} {"step": 8500, "timestamp": 1778334905.2413092, "eoc/jacobian_sigma/layer_0/attn": 1542.4219970703125, "eoc/jacobian_sigma/layer_0/mlp": 10887.1630859375, "eoc/jacobian_sigma/layer_0": 10887.1630859375, "eoc/jacobian_sigma/layer_7/attn": 1.139979362487793, "eoc/jacobian_sigma/layer_7/mlp": 1.8094159364700317, "eoc/jacobian_sigma/layer_7": 1.8094159364700317, "eoc/jacobian_sigma/layer_14/attn": 1.8923590183258057, "eoc/jacobian_sigma/layer_14/mlp": 12.526663780212402, "eoc/jacobian_sigma/layer_14": 12.526663780212402, "eoc/jacobian_sigma/layer_21/attn": 1.099933385848999, "eoc/jacobian_sigma/layer_21/mlp": 6.456267356872559, "eoc/jacobian_sigma/layer_21": 6.456267356872559, "eoc/jacobian_sigma/layer_27/attn": 3.6816534996032715, "eoc/jacobian_sigma/layer_27/mlp": 31.462705612182617, "eoc/jacobian_sigma/layer_27": 31.462705612182617, "eoc/layer0_sigma": 10887.1630859375, "eoc/sigma_max": 31.462705612182617, "eoc/sigma_min": 1.8094159364700317, "eoc/sigma_mean": 13.063763171434402, "eoc/time_s": 0.7532963752746582} {"step": 8510, "timestamp": 1778334915.6418824, "train/loss": 2.347134232521057, "train/z_loss": 0.0013496998813934623, "train/perplexity": 10.455563535928068, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1151636.6759052805, "perf/iters_per_sec": 0.5491431598211672, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8210187673568725, "data/tokens_consumed": 17848860672, "data/tokens_consumed_B": 17.848860672, "train/loss_slope": -5.387837601394771e-06} {"step": 8520, "timestamp": 1778334926.0182467, "train/loss": 2.378750705718994, "train/z_loss": 0.0013396951369941234, "train/perplexity": 10.791412788867875, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022527.7273657776, "perf/iters_per_sec": 0.9644163739041222, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036896538734436, "data/tokens_consumed": 17869832192, "data/tokens_consumed_B": 17.869832192, "train/loss_slope": -5.503369922792958e-07} {"step": 8530, "timestamp": 1778334936.8215556, "train/loss": 2.410731840133667, "train/z_loss": 0.0013435784610919654, "train/perplexity": 11.14211240702446, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1942128.6711127313, "perf/iters_per_sec": 0.9260791163982064, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0798213481903076, "data/tokens_consumed": 17890803712, "data/tokens_consumed_B": 17.890803712, "train/loss_slope": 3.881146204639638e-06} {"step": 8540, "timestamp": 1778334947.196353, "train/loss": 2.3583417654037477, "train/z_loss": 0.0013548480230383575, "train/perplexity": 10.573403723474726, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022561.0719625873, "perf/iters_per_sec": 0.9644322738469063, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368794441223144, "data/tokens_consumed": 17911775232, "data/tokens_consumed_B": 17.911775232, "train/loss_slope": 4.216793310953373e-06} {"step": 8550, "timestamp": 1778334957.5692997, "grad/layer_0/attn": 0.003255055518820882, "grad/layer_0/mlp": 0.003342686453834176, "grad/layer_0/attn_mlp_ratio": 0.9737842500031065, "grad/layer_4/attn": 0.0022198865190148354, "grad/layer_4/mlp": 0.002805291907861829, "grad/layer_4/attn_mlp_ratio": 0.7913210149936638, "grad/layer_8/attn": 0.003485599299892783, "grad/layer_8/mlp": 0.0035626599565148354, "grad/layer_8/attn_mlp_ratio": 0.9783698822229374, "grad/layer_12/attn": 0.0070551251992583275, "grad/layer_12/mlp": 0.0072549814358353615, "grad/layer_12/attn_mlp_ratio": 0.9724525368410849, "grad/layer_16/attn": 0.004330504685640335, "grad/layer_16/mlp": 0.004858005326241255, "grad/layer_16/attn_mlp_ratio": 0.8914161895020652, "grad/layer_20/attn": 0.0038196786772459745, "grad/layer_20/mlp": 0.005974994041025639, "grad/layer_20/attn_mlp_ratio": 0.639277392929846, "grad/layer_24/attn": 0.011476777493953705, "grad/layer_24/mlp": 0.011481953784823418, "grad/layer_24/attn_mlp_ratio": 0.9995491716025307, "grad/layer_27/attn": 0.00564810074865818, "grad/layer_27/mlp": 0.011396841146051884, "grad/layer_27/attn_mlp_ratio": 0.4955847525396396} {"step": 8550, "timestamp": 1778334958.1831067, "eos/sharpness": 40.72973728179931, "eos/L0_probe": 2.3271279335021973, "eos/L_plus": 2.529588222503662, "eos/L_minus": 2.5319650173187256, "eos/grad_norm": 0.14746049046516418, "eos/embed_grad_frac": 0.1331721991300583, "eos/time_s": 0.6108536720275879} {"step": 8550, "timestamp": 1778334958.2027302, "train/loss": 2.3342483758926393, "train/z_loss": 0.0013591597322374581, "train/perplexity": 10.321698975296064, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906655.012308698, "perf/iters_per_sec": 0.9091639577430238, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.099911618232727, "data/tokens_consumed": 17932746752, "data/tokens_consumed_B": 17.932746752, "train/loss_slope": 2.9966600585763362e-06} {"step": 8550, "timestamp": 1778334959.5648491, "geo/rankme_last": 428.26123046875, "geo/layer_0/stable_rank_q_proj": 20.757436752319336, "geo/layer_0/stable_rank_k_proj": 17.210094451904297, "geo/layer_0/stable_rank_o_proj": 44.76875305175781, "geo/layer_0/stable_rank_gate_proj": 127.39791107177734, "geo/layer_0/stable_rank_down_proj": 56.9205207824707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06269131600856781, "geo/layer_0/attn_entropy_mean": 6.245091915130615, "geo/layer_0/attn_entropy_std": 0.4323810338973999, "geo/layer_7/stable_rank_q_proj": 42.261390686035156, "geo/layer_7/stable_rank_k_proj": 38.66960525512695, "geo/layer_7/stable_rank_o_proj": 89.01144409179688, "geo/layer_7/stable_rank_gate_proj": 79.1566390991211, "geo/layer_7/stable_rank_down_proj": 144.52450561523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4004462957382202, "geo/layer_7/attn_entropy_mean": 4.735079288482666, "geo/layer_7/attn_entropy_std": 0.749818742275238, "geo/layer_14/stable_rank_q_proj": 51.40055465698242, "geo/layer_14/stable_rank_k_proj": 42.578224182128906, "geo/layer_14/stable_rank_o_proj": 42.44195556640625, "geo/layer_14/stable_rank_gate_proj": 71.99758911132812, "geo/layer_14/stable_rank_down_proj": 126.64700317382812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37373003363609314, "geo/layer_14/attn_entropy_mean": 5.498086929321289, "geo/layer_14/attn_entropy_std": 0.46347564458847046, "geo/layer_21/stable_rank_q_proj": 39.328731536865234, "geo/layer_21/stable_rank_k_proj": 28.771772384643555, "geo/layer_21/stable_rank_o_proj": 65.64889526367188, "geo/layer_21/stable_rank_gate_proj": 61.48595428466797, "geo/layer_21/stable_rank_down_proj": 49.80375671386719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1373714953660965, "geo/layer_21/attn_entropy_mean": 5.862052917480469, "geo/layer_21/attn_entropy_std": 0.3281301259994507, "geo/layer_27/stable_rank_q_proj": 44.41249084472656, "geo/layer_27/stable_rank_k_proj": 30.139265060424805, "geo/layer_27/stable_rank_o_proj": 107.82640075683594, "geo/layer_27/stable_rank_gate_proj": 71.3719711303711, "geo/layer_27/stable_rank_down_proj": 128.74208068847656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10256468504667282, "geo/layer_27/attn_entropy_mean": 4.334344863891602, "geo/layer_27/attn_entropy_std": 0.7024691104888916, "attnres/final_alpha/block_0": 0.25668948888778687, "attnres/block_norm/0": 1.7782293558120728, "attnres/final_alpha/block_1": 0.004050863441079855, "attnres/block_norm/1": 50400.484375, "attnres/final_alpha/block_2": 0.008795695379376411, "attnres/block_norm/2": 29839.021484375, "attnres/final_alpha/block_3": 0.010937835089862347, "attnres/block_norm/3": 70375.25, "attnres/final_alpha/block_4": 0.012244722805917263, "attnres/block_norm/4": 17130.193359375, "attnres/final_alpha/block_5": 0.6053051948547363, "attnres/block_norm/5": 7127.1181640625, "attnres/final_alpha/block_6": 0.10197616368532181, "attnres/block_norm/6": 47524.1796875, "geo/tier1_time_s": 1.3581128120422363, "geo/step": 8550.0, "geo/rankme_slope": 0.0003517000745610744} {"step": 8560, "timestamp": 1778334969.943497, "train/loss": 2.333470177650452, "train/z_loss": 0.001355734432581812, "train/perplexity": 10.313669771858303, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786801.6037427045, "perf/iters_per_sec": 0.8520133990014574, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1736904621124267, "data/tokens_consumed": 17953718272, "data/tokens_consumed_B": 17.953718272, "train/loss_slope": 4.686166858873381e-06} {"step": 8570, "timestamp": 1778334980.3171926, "train/loss": 2.345234727859497, "train/z_loss": 0.001358428702224046, "train/perplexity": 10.435721994768183, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022859.780689915, "perf/iters_per_sec": 0.9645747092675757, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367263317108155, "data/tokens_consumed": 17974689792, "data/tokens_consumed_B": 17.974689792, "train/loss_slope": 2.053994700388605e-06} {"step": 8580, "timestamp": 1778334990.7069223, "train/loss": 2.323719358444214, "train/z_loss": 0.0013541834778152406, "train/perplexity": 10.213591756813182, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019980.2831727043, "perf/iters_per_sec": 0.963201657854416, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0382041931152344, "data/tokens_consumed": 17995661312, "data/tokens_consumed_B": 17.995661312, "train/loss_slope": 3.7322539379028415e-07} {"step": 8590, "timestamp": 1778335001.678554, "train/loss": 2.350005316734314, "train/z_loss": 0.001346875529270619, "train/perplexity": 10.48562547386413, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912322.9676294932, "perf/iters_per_sec": 0.9118666494510141, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0966515779495238, "data/tokens_consumed": 18016632832, "data/tokens_consumed_B": 18.016632832, "train/loss_slope": 8.659187394721969e-07} {"step": 8600, "timestamp": 1778335012.0427372, "grad/layer_0/attn": 0.0026201442815363407, "grad/layer_0/mlp": 0.0031838007271289825, "grad/layer_0/attn_mlp_ratio": 0.8229611159122276, "grad/layer_4/attn": 0.001903009251691401, "grad/layer_4/mlp": 0.0027081025764346123, "grad/layer_4/attn_mlp_ratio": 0.7027094165413376, "grad/layer_8/attn": 0.0029483609832823277, "grad/layer_8/mlp": 0.0035028792917728424, "grad/layer_8/attn_mlp_ratio": 0.841696402738586, "grad/layer_12/attn": 0.008613626472651958, "grad/layer_12/mlp": 0.0069947452284395695, "grad/layer_12/attn_mlp_ratio": 1.2314424711976668, "grad/layer_16/attn": 0.003603033721446991, "grad/layer_16/mlp": 0.004403823055326939, "grad/layer_16/attn_mlp_ratio": 0.8181604016248248, "grad/layer_20/attn": 0.007824813947081566, "grad/layer_20/mlp": 0.006197754759341478, "grad/layer_20/attn_mlp_ratio": 1.2625239501507108, "grad/layer_24/attn": 0.007761494722217321, "grad/layer_24/mlp": 0.010223384015262127, "grad/layer_24/attn_mlp_ratio": 0.7591903654124139, "grad/layer_27/attn": 0.016910376027226448, "grad/layer_27/mlp": 0.009997807443141937, "grad/layer_27/attn_mlp_ratio": 1.6914084367253333} {"step": 8600, "timestamp": 1778335012.058215, "train/loss": 2.322134494781494, "train/z_loss": 0.0013704560929909348, "train/perplexity": 10.197417426811446, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021652.464656506, "perf/iters_per_sec": 0.9639990161211519, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373454570770264, "data/tokens_consumed": 18037604352, "data/tokens_consumed_B": 18.037604352, "train/loss_slope": -2.079334811265868e-06} {"step": 8610, "timestamp": 1778335022.4376554, "train/loss": 2.3498247861862183, "train/z_loss": 0.0013486375333741306, "train/perplexity": 10.4837326690099, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022068.6417476851, "perf/iters_per_sec": 0.964197464822619, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037131953239441, "data/tokens_consumed": 18058575872, "data/tokens_consumed_B": 18.058575872, "train/loss_slope": -1.1393948499769444e-06} {"step": 8620, "timestamp": 1778335032.8162656, "train/loss": 2.347062087059021, "train/z_loss": 0.0013531557400710882, "train/perplexity": 10.4548092416757, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021516.5180716058, "perf/iters_per_sec": 0.9639341917379407, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374152183532714, "data/tokens_consumed": 18079547392, "data/tokens_consumed_B": 18.079547392, "train/loss_slope": 6.695550517423734e-08} {"step": 8625, "timestamp": 1778335038.597191, "eos/sharpness": 53.82537841796874, "eos/L0_probe": 2.328247308731079, "eos/L_plus": 2.6460371017456055, "eos/L_minus": 2.5487112998962402, "eos/grad_norm": 0.1624845564365387, "eos/embed_grad_frac": 0.09826330095529556, "eos/time_s": 0.5997600555419922} {"step": 8625, "timestamp": 1778335039.976094, "geo/rankme_last": 427.8064880371094, "geo/layer_0/stable_rank_q_proj": 20.77246856689453, "geo/layer_0/stable_rank_k_proj": 17.16126823425293, "geo/layer_0/stable_rank_o_proj": 44.77227020263672, "geo/layer_0/stable_rank_gate_proj": 127.28173828125, "geo/layer_0/stable_rank_down_proj": 56.85517120361328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06508149951696396, "geo/layer_0/attn_entropy_mean": 6.240110397338867, "geo/layer_0/attn_entropy_std": 0.4336262345314026, "geo/layer_7/stable_rank_q_proj": 42.255733489990234, "geo/layer_7/stable_rank_k_proj": 38.636199951171875, "geo/layer_7/stable_rank_o_proj": 89.052978515625, "geo/layer_7/stable_rank_gate_proj": 79.08338928222656, "geo/layer_7/stable_rank_down_proj": 144.7157440185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3978823721408844, "geo/layer_7/attn_entropy_mean": 4.72810697555542, "geo/layer_7/attn_entropy_std": 0.762075662612915, "geo/layer_14/stable_rank_q_proj": 51.438873291015625, "geo/layer_14/stable_rank_k_proj": 42.56843948364258, "geo/layer_14/stable_rank_o_proj": 42.390838623046875, "geo/layer_14/stable_rank_gate_proj": 71.93880462646484, "geo/layer_14/stable_rank_down_proj": 126.4575424194336, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38117751479148865, "geo/layer_14/attn_entropy_mean": 5.533438682556152, "geo/layer_14/attn_entropy_std": 0.45081639289855957, "geo/layer_21/stable_rank_q_proj": 39.28438186645508, "geo/layer_21/stable_rank_k_proj": 28.740150451660156, "geo/layer_21/stable_rank_o_proj": 65.6629638671875, "geo/layer_21/stable_rank_gate_proj": 61.51987838745117, "geo/layer_21/stable_rank_down_proj": 49.7615852355957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13575290143489838, "geo/layer_21/attn_entropy_mean": 5.877415180206299, "geo/layer_21/attn_entropy_std": 0.3257925510406494, "geo/layer_27/stable_rank_q_proj": 44.4838752746582, "geo/layer_27/stable_rank_k_proj": 30.124217987060547, "geo/layer_27/stable_rank_o_proj": 107.84442138671875, "geo/layer_27/stable_rank_gate_proj": 71.36331176757812, "geo/layer_27/stable_rank_down_proj": 129.0262451171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10897035151720047, "geo/layer_27/attn_entropy_mean": 4.322628021240234, "geo/layer_27/attn_entropy_std": 0.6994255185127258, "attnres/final_alpha/block_0": 0.25404584407806396, "attnres/block_norm/0": 1.7782365083694458, "attnres/final_alpha/block_1": 0.003940282389521599, "attnres/block_norm/1": 50358.3671875, "attnres/final_alpha/block_2": 0.00853879377245903, "attnres/block_norm/2": 29871.8984375, "attnres/final_alpha/block_3": 0.010569466277956963, "attnres/block_norm/3": 71494.90625, "attnres/final_alpha/block_4": 0.01204054057598114, "attnres/block_norm/4": 17137.658203125, "attnres/final_alpha/block_5": 0.6118000149726868, "attnres/block_norm/5": 7129.033203125, "attnres/final_alpha/block_6": 0.09906508773565292, "attnres/block_norm/6": 47804.0625, "geo/tier1_time_s": 1.357388973236084, "geo/step": 8625.0, "geo/rankme_slope": 0.00032424159116771706} {"step": 8630, "timestamp": 1778335045.1676488, "train/loss": 2.3047083616256714, "train/z_loss": 0.0013547007576562465, "train/perplexity": 10.02125524362692, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698722.1156599298, "perf/iters_per_sec": 0.8100138262080812, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2345468282699585, "data/tokens_consumed": 18100518912, "data/tokens_consumed_B": 18.100518912, "train/loss_slope": -2.969492472509264e-06} {"step": 8640, "timestamp": 1778335055.5424325, "train/loss": 2.348942518234253, "train/z_loss": 0.0013523177010938525, "train/perplexity": 10.474487286711028, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022223.6307409042, "perf/iters_per_sec": 0.9642713693336984, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370524644851684, "data/tokens_consumed": 18121490432, "data/tokens_consumed_B": 18.121490432, "train/loss_slope": -2.6788828575392943e-06} {"step": 8650, "timestamp": 1778335065.9167705, "grad/layer_0/attn": 0.0030153561383485794, "grad/layer_0/mlp": 0.0033701984211802483, "grad/layer_0/attn_mlp_ratio": 0.8947117267420192, "grad/layer_4/attn": 0.002028045477345586, "grad/layer_4/mlp": 0.002751019550487399, "grad/layer_4/attn_mlp_ratio": 0.7371977430209473, "grad/layer_8/attn": 0.003933948464691639, "grad/layer_8/mlp": 0.0036531533114612103, "grad/layer_8/attn_mlp_ratio": 1.0768637452644274, "grad/layer_12/attn": 0.009553519077599049, "grad/layer_12/mlp": 0.006874392740428448, "grad/layer_12/attn_mlp_ratio": 1.3897255073080206, "grad/layer_16/attn": 0.005015794653445482, "grad/layer_16/mlp": 0.004525304771959782, "grad/layer_16/attn_mlp_ratio": 1.1083882291610738, "grad/layer_20/attn": 0.003921203315258026, "grad/layer_20/mlp": 0.007754385471343994, "grad/layer_20/attn_mlp_ratio": 0.5056755663206468, "grad/layer_24/attn": 0.020590607076883316, "grad/layer_24/mlp": 0.017387352883815765, "grad/layer_24/attn_mlp_ratio": 1.184228967804884, "grad/layer_27/attn": 0.009051201865077019, "grad/layer_27/mlp": 0.020711727440357208, "grad/layer_27/attn_mlp_ratio": 0.43700854250040577} {"step": 8650, "timestamp": 1778335065.932513, "train/loss": 2.369494318962097, "train/z_loss": 0.0013359544798731803, "train/perplexity": 10.691984183180258, "train/grad_norm": 0.36328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019600.8089081547, "perf/iters_per_sec": 0.963020710424497, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383992671966553, "data/tokens_consumed": 18142461952, "data/tokens_consumed_B": 18.142461952, "train/loss_slope": -3.581460088071258e-07} {"step": 8660, "timestamp": 1778335076.3240304, "train/loss": 2.3450109720230103, "train/z_loss": 0.0013531151926144958, "train/perplexity": 10.433387202285363, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019464.07172594, "perf/iters_per_sec": 0.9629555090551091, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0384695768356322, "data/tokens_consumed": 18163433472, "data/tokens_consumed_B": 18.163433472, "train/loss_slope": 2.8042156632654087e-07} {"step": 8670, "timestamp": 1778335086.6988623, "train/loss": 2.3988979339599608, "train/z_loss": 0.0013398037757724524, "train/perplexity": 11.011034803937692, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022372.598761542, "perf/iters_per_sec": 0.9643424028213224, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369760751724244, "data/tokens_consumed": 18184404992, "data/tokens_consumed_B": 18.184404992, "train/loss_slope": 2.5700726667896582e-06} {"step": 8680, "timestamp": 1778335097.0784569, "train/loss": 2.3626734256744384, "train/z_loss": 0.0013513805344700814, "train/perplexity": 10.619303455563529, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021362.1019590367, "perf/iters_per_sec": 0.9638605603976425, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374944686889649, "data/tokens_consumed": 18205376512, "data/tokens_consumed_B": 18.205376512, "train/loss_slope": 4.342478644264851e-06} {"step": 8690, "timestamp": 1778335107.45946, "train/loss": 2.3630121469497682, "train/z_loss": 0.0013486026437021792, "train/perplexity": 10.622901048829405, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021767.9360438543, "perf/iters_per_sec": 0.9640540771693489, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372862100601197, "data/tokens_consumed": 18226348032, "data/tokens_consumed_B": 18.226348032, "train/loss_slope": 7.070557591151869e-06} {"step": 8700, "timestamp": 1778335117.8212116, "grad/layer_0/attn": 0.003889933694154024, "grad/layer_0/mlp": 0.0036116596311330795, "grad/layer_0/attn_mlp_ratio": 1.077048776390029, "grad/layer_4/attn": 0.0020946187432855368, "grad/layer_4/mlp": 0.0024849995970726013, "grad/layer_4/attn_mlp_ratio": 0.8429050296275914, "grad/layer_8/attn": 0.003868148662149906, "grad/layer_8/mlp": 0.003369144396856427, "grad/layer_8/attn_mlp_ratio": 1.1481100516048135, "grad/layer_12/attn": 0.006210292223840952, "grad/layer_12/mlp": 0.0075223930180072784, "grad/layer_12/attn_mlp_ratio": 0.8255740063590417, "grad/layer_16/attn": 0.004943538922816515, "grad/layer_16/mlp": 0.005022713448852301, "grad/layer_16/attn_mlp_ratio": 0.984236682967143, "grad/layer_20/attn": 0.004825914278626442, "grad/layer_20/mlp": 0.00627252459526062, "grad/layer_20/attn_mlp_ratio": 0.7693734999995768, "grad/layer_24/attn": 0.01874796487390995, "grad/layer_24/mlp": 0.014832787215709686, "grad/layer_24/attn_mlp_ratio": 1.263954270688802, "grad/layer_27/attn": 0.0102924183011055, "grad/layer_27/mlp": 0.01580841653048992, "grad/layer_27/attn_mlp_ratio": 0.6510720549491568} {"step": 8700, "timestamp": 1778335118.4142704, "eos/sharpness": 66.84942245483397, "eos/L0_probe": 2.327626943588257, "eos/L_plus": 2.7191720008850098, "eos/L_minus": 2.6045761108398438, "eos/grad_norm": 0.27381277084350586, "eos/embed_grad_frac": 0.033171314746141434, "eos/time_s": 0.590308666229248} {"step": 8700, "timestamp": 1778335118.4332993, "train/loss": 2.313446378707886, "train/z_loss": 0.001358557539060712, "train/perplexity": 10.109204836051315, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911899.370255351, "perf/iters_per_sec": 0.9116646624829059, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0968945503234864, "data/tokens_consumed": 18247319552, "data/tokens_consumed_B": 18.247319552, "train/loss_slope": 3.258817989190279e-06} {"step": 8700, "timestamp": 1778335119.7930303, "geo/rankme_last": 429.1161193847656, "geo/layer_0/stable_rank_q_proj": 20.74656105041504, "geo/layer_0/stable_rank_k_proj": 17.162168502807617, "geo/layer_0/stable_rank_o_proj": 44.727996826171875, "geo/layer_0/stable_rank_gate_proj": 127.23369598388672, "geo/layer_0/stable_rank_down_proj": 56.846107482910156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06675643473863602, "geo/layer_0/attn_entropy_mean": 6.235823631286621, "geo/layer_0/attn_entropy_std": 0.43743276596069336, "geo/layer_7/stable_rank_q_proj": 42.293479919433594, "geo/layer_7/stable_rank_k_proj": 38.610774993896484, "geo/layer_7/stable_rank_o_proj": 89.17684936523438, "geo/layer_7/stable_rank_gate_proj": 79.0400161743164, "geo/layer_7/stable_rank_down_proj": 144.4713134765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4047275185585022, "geo/layer_7/attn_entropy_mean": 4.706167697906494, "geo/layer_7/attn_entropy_std": 0.7591027617454529, "geo/layer_14/stable_rank_q_proj": 51.58811569213867, "geo/layer_14/stable_rank_k_proj": 42.609825134277344, "geo/layer_14/stable_rank_o_proj": 42.335105895996094, "geo/layer_14/stable_rank_gate_proj": 71.9311752319336, "geo/layer_14/stable_rank_down_proj": 126.46110534667969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37118563055992126, "geo/layer_14/attn_entropy_mean": 5.540119171142578, "geo/layer_14/attn_entropy_std": 0.45732536911964417, "geo/layer_21/stable_rank_q_proj": 39.304080963134766, "geo/layer_21/stable_rank_k_proj": 28.829919815063477, "geo/layer_21/stable_rank_o_proj": 65.58588409423828, "geo/layer_21/stable_rank_gate_proj": 61.5308837890625, "geo/layer_21/stable_rank_down_proj": 49.71337127685547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13976767659187317, "geo/layer_21/attn_entropy_mean": 5.866855621337891, "geo/layer_21/attn_entropy_std": 0.3209395408630371, "geo/layer_27/stable_rank_q_proj": 44.39973449707031, "geo/layer_27/stable_rank_k_proj": 30.10407829284668, "geo/layer_27/stable_rank_o_proj": 107.94586181640625, "geo/layer_27/stable_rank_gate_proj": 71.3941879272461, "geo/layer_27/stable_rank_down_proj": 129.004150390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09302495419979095, "geo/layer_27/attn_entropy_mean": 4.340854644775391, "geo/layer_27/attn_entropy_std": 0.6970290541648865, "attnres/final_alpha/block_0": 0.2539350390434265, "attnres/block_norm/0": 1.7782766819000244, "attnres/final_alpha/block_1": 0.003900178475305438, "attnres/block_norm/1": 50449.6796875, "attnres/final_alpha/block_2": 0.008505800738930702, "attnres/block_norm/2": 29911.107421875, "attnres/final_alpha/block_3": 0.01080925203859806, "attnres/block_norm/3": 70424.484375, "attnres/final_alpha/block_4": 0.011937015689909458, "attnres/block_norm/4": 17155.650390625, "attnres/final_alpha/block_5": 0.6126669049263, "attnres/block_norm/5": 7124.1220703125, "attnres/final_alpha/block_6": 0.09824579209089279, "attnres/block_norm/6": 47630.578125, "geo/tier1_time_s": 1.3556866645812988, "geo/step": 8700.0, "geo/rankme_slope": 0.00030405537214885954} {"step": 8710, "timestamp": 1778335130.169764, "train/loss": 2.36508309841156, "train/z_loss": 0.0013577333884313703, "train/perplexity": 10.644923356979731, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787457.7921195566, "perf/iters_per_sec": 0.8523262940023215, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1732595920562745, "data/tokens_consumed": 18268291072, "data/tokens_consumed_B": 18.268291072, "train/loss_slope": 4.631236119560739e-06} {"step": 8720, "timestamp": 1778335140.5490487, "train/loss": 2.3677537202835084, "train/z_loss": 0.0013580673257820308, "train/perplexity": 10.673389916911733, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022024.9014027317, "perf/iters_per_sec": 0.9641766078008326, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371543884277343, "data/tokens_consumed": 18289262592, "data/tokens_consumed_B": 18.289262592, "train/loss_slope": 5.7250514318018e-06} {"step": 8730, "timestamp": 1778335150.9224494, "train/loss": 2.3539785861968996, "train/z_loss": 0.0013504815055057407, "train/perplexity": 10.52737056668813, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022588.5111583574, "perf/iters_per_sec": 0.9644453578750407, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368653774261474, "data/tokens_consumed": 18310234112, "data/tokens_consumed_B": 18.310234112, "train/loss_slope": 6.894860423580515e-06} {"step": 8740, "timestamp": 1778335161.299485, "train/loss": 2.317817497253418, "train/z_loss": 0.0013550476171076298, "train/perplexity": 10.15349008631815, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021874.4509254126, "perf/iters_per_sec": 0.9641048674227775, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372315645217896, "data/tokens_consumed": 18331205632, "data/tokens_consumed_B": 18.331205632, "train/loss_slope": 1.762194451790747e-06} {"step": 8750, "timestamp": 1778335171.6717904, "grad/layer_0/attn": 0.003449154319241643, "grad/layer_0/mlp": 0.00356867304071784, "grad/layer_0/attn_mlp_ratio": 0.9665088909061154, "grad/layer_4/attn": 0.002233545994386077, "grad/layer_4/mlp": 0.002622397616505623, "grad/layer_4/attn_mlp_ratio": 0.8517190128438277, "grad/layer_8/attn": 0.005992047488689423, "grad/layer_8/mlp": 0.0035019218921661377, "grad/layer_8/attn_mlp_ratio": 1.7110739479902008, "grad/layer_12/attn": 0.005405241623520851, "grad/layer_12/mlp": 0.006846929434686899, "grad/layer_12/attn_mlp_ratio": 0.78944022954196, "grad/layer_16/attn": 0.0059020486660301685, "grad/layer_16/mlp": 0.004587174858897924, "grad/layer_16/attn_mlp_ratio": 1.2866412811618908, "grad/layer_20/attn": 0.00377103453502059, "grad/layer_20/mlp": 0.005700340960174799, "grad/layer_20/attn_mlp_ratio": 0.6615454224952906, "grad/layer_24/attn": 0.011265809647738934, "grad/layer_24/mlp": 0.010542072355747223, "grad/layer_24/attn_mlp_ratio": 1.068652268804807, "grad/layer_27/attn": 0.007617715280503035, "grad/layer_27/mlp": 0.009586326777935028, "grad/layer_27/attn_mlp_ratio": 0.7946438064861765} {"step": 8750, "timestamp": 1778335171.687515, "train/loss": 2.341966247558594, "train/z_loss": 0.0013532304321415723, "train/perplexity": 10.401668724539856, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020173.507139605, "perf/iters_per_sec": 0.9632937942216897, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381048917770386, "data/tokens_consumed": 18352177152, "data/tokens_consumed_B": 18.352177152, "train/loss_slope": 2.965587486635532e-07} {"step": 8760, "timestamp": 1778335182.0622635, "train/loss": 2.353920340538025, "train/z_loss": 0.0013532249955460428, "train/perplexity": 10.52675741091026, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022389.059121491, "perf/iters_per_sec": 0.9643502517325835, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369676351547241, "data/tokens_consumed": 18373148672, "data/tokens_consumed_B": 18.373148672, "train/loss_slope": 4.445859114758215e-06} {"step": 8770, "timestamp": 1778335192.4374459, "train/loss": 2.318121409416199, "train/z_loss": 0.001344120071735233, "train/perplexity": 10.15657632439895, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022241.7158875156, "perf/iters_per_sec": 0.9642799930036142, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370431900024415, "data/tokens_consumed": 18394120192, "data/tokens_consumed_B": 18.394120192, "train/loss_slope": 2.2033029156263498e-06} {"step": 8775, "timestamp": 1778335198.2147925, "eos/sharpness": 67.23744869232176, "eos/L0_probe": 2.3309743404388428, "eos/L_plus": 2.630983829498291, "eos/L_minus": 2.7033393383026123, "eos/grad_norm": 0.22703686356544495, "eos/embed_grad_frac": 0.050444863736629486, "eos/time_s": 0.5982546806335449} {"step": 8775, "timestamp": 1778335199.595644, "geo/rankme_last": 428.8763427734375, "geo/layer_0/stable_rank_q_proj": 20.720661163330078, "geo/layer_0/stable_rank_k_proj": 17.176645278930664, "geo/layer_0/stable_rank_o_proj": 44.74421310424805, "geo/layer_0/stable_rank_gate_proj": 127.16204071044922, "geo/layer_0/stable_rank_down_proj": 56.855445861816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06526730209589005, "geo/layer_0/attn_entropy_mean": 6.234212398529053, "geo/layer_0/attn_entropy_std": 0.4339940547943115, "geo/layer_7/stable_rank_q_proj": 42.17842483520508, "geo/layer_7/stable_rank_k_proj": 38.567935943603516, "geo/layer_7/stable_rank_o_proj": 89.06206512451172, "geo/layer_7/stable_rank_gate_proj": 78.95250701904297, "geo/layer_7/stable_rank_down_proj": 144.27508544921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4022538661956787, "geo/layer_7/attn_entropy_mean": 4.71605110168457, "geo/layer_7/attn_entropy_std": 0.7704796195030212, "geo/layer_14/stable_rank_q_proj": 51.61821746826172, "geo/layer_14/stable_rank_k_proj": 42.63669967651367, "geo/layer_14/stable_rank_o_proj": 42.34696960449219, "geo/layer_14/stable_rank_gate_proj": 71.98111724853516, "geo/layer_14/stable_rank_down_proj": 126.4418716430664, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3617876470088959, "geo/layer_14/attn_entropy_mean": 5.5384955406188965, "geo/layer_14/attn_entropy_std": 0.44742727279663086, "geo/layer_21/stable_rank_q_proj": 39.22193908691406, "geo/layer_21/stable_rank_k_proj": 28.784221649169922, "geo/layer_21/stable_rank_o_proj": 65.4302978515625, "geo/layer_21/stable_rank_gate_proj": 61.45514678955078, "geo/layer_21/stable_rank_down_proj": 49.70170974731445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13425640761852264, "geo/layer_21/attn_entropy_mean": 5.850473880767822, "geo/layer_21/attn_entropy_std": 0.32583925127983093, "geo/layer_27/stable_rank_q_proj": 44.42916488647461, "geo/layer_27/stable_rank_k_proj": 30.097543716430664, "geo/layer_27/stable_rank_o_proj": 107.84967803955078, "geo/layer_27/stable_rank_gate_proj": 71.26908111572266, "geo/layer_27/stable_rank_down_proj": 129.27880859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09164875000715256, "geo/layer_27/attn_entropy_mean": 4.302424430847168, "geo/layer_27/attn_entropy_std": 0.7017181515693665, "attnres/final_alpha/block_0": 0.25657689571380615, "attnres/block_norm/0": 1.7785389423370361, "attnres/final_alpha/block_1": 0.004038914106786251, "attnres/block_norm/1": 50340.65625, "attnres/final_alpha/block_2": 0.00873682089149952, "attnres/block_norm/2": 29821.7890625, "attnres/final_alpha/block_3": 0.010767747648060322, "attnres/block_norm/3": 71502.921875, "attnres/final_alpha/block_4": 0.01223667711019516, "attnres/block_norm/4": 17227.755859375, "attnres/final_alpha/block_5": 0.6051972508430481, "attnres/block_norm/5": 7089.7607421875, "attnres/final_alpha/block_6": 0.10244572907686234, "attnres/block_norm/6": 47483.5546875, "geo/tier1_time_s": 1.361788034439087, "geo/step": 8775.0, "geo/rankme_slope": 0.0002903431880564726} {"step": 8780, "timestamp": 1778335204.7845047, "train/loss": 2.335415172576904, "train/z_loss": 0.0013666836428456008, "train/perplexity": 10.333749328225212, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699427.0507222991, "perf/iters_per_sec": 0.8103499654399391, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340347290039062, "data/tokens_consumed": 18415091712, "data/tokens_consumed_B": 18.415091712, "train/loss_slope": 2.381035060522119e-07} {"step": 8790, "timestamp": 1778335215.1635184, "train/loss": 2.3228175401687623, "train/z_loss": 0.0013497801846824586, "train/perplexity": 10.204385105096236, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021738.7532968116, "perf/iters_per_sec": 0.9640401617511805, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373011827468872, "data/tokens_consumed": 18436063232, "data/tokens_consumed_B": 18.436063232, "train/loss_slope": -1.1398532316915706e-06} {"step": 8800, "timestamp": 1778335225.5337284, "grad/layer_0/attn": 0.0028482333291321993, "grad/layer_0/mlp": 0.003086712211370468, "grad/layer_0/attn_mlp_ratio": 0.9227401331313618, "grad/layer_4/attn": 0.0023538589011877775, "grad/layer_4/mlp": 0.0026822069194167852, "grad/layer_4/attn_mlp_ratio": 0.8775828577540583, "grad/layer_8/attn": 0.003596043447032571, "grad/layer_8/mlp": 0.0034224858973175287, "grad/layer_8/attn_mlp_ratio": 1.0507109305490432, "grad/layer_12/attn": 0.00626295804977417, "grad/layer_12/mlp": 0.006613509729504585, "grad/layer_12/attn_mlp_ratio": 0.9469945930727263, "grad/layer_16/attn": 0.004590724129229784, "grad/layer_16/mlp": 0.004848450422286987, "grad/layer_16/attn_mlp_ratio": 0.9468435550961063, "grad/layer_20/attn": 0.005772230215370655, "grad/layer_20/mlp": 0.006419872399419546, "grad/layer_20/attn_mlp_ratio": 0.8991191360720252, "grad/layer_24/attn": 0.013897079974412918, "grad/layer_24/mlp": 0.011743878945708275, "grad/layer_24/attn_mlp_ratio": 1.1833466540590365, "grad/layer_27/attn": 0.004572734236717224, "grad/layer_27/mlp": 0.012966981157660484, "grad/layer_27/attn_mlp_ratio": 0.35264447027836926} {"step": 8800, "timestamp": 1778335225.549299, "train/loss": 2.3677153825759887, "train/z_loss": 0.0013516467995941639, "train/perplexity": 10.67298073145452, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020477.544374866, "perf/iters_per_sec": 0.9634387704729395, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379486799240112, "data/tokens_consumed": 18457034752, "data/tokens_consumed_B": 18.457034752, "train/loss_slope": -1.7895033293001053e-07} {"step": 8810, "timestamp": 1778335235.9234927, "train/loss": 2.336467432975769, "train/z_loss": 0.0013563641346991061, "train/perplexity": 10.34462884645433, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022428.3976772302, "perf/iters_per_sec": 0.96436900981771, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369474649429322, "data/tokens_consumed": 18478006272, "data/tokens_consumed_B": 18.478006272, "train/loss_slope": -1.3763064634252788e-06} {"step": 8820, "timestamp": 1778335246.3034444, "train/loss": 2.3447962045669555, "train/z_loss": 0.001352751604281366, "train/perplexity": 10.431146690860976, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021335.2534301553, "perf/iters_per_sec": 0.9638477580214287, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037508249282837, "data/tokens_consumed": 18498977792, "data/tokens_consumed_B": 18.498977792, "train/loss_slope": -8.757298106443566e-07} {"step": 8830, "timestamp": 1778335256.689091, "train/loss": 2.3364209651947023, "train/z_loss": 0.0013484698720276356, "train/perplexity": 10.34414816567405, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020616.6468584314, "perf/iters_per_sec": 0.9635050997059018, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378772258758544, "data/tokens_consumed": 18519949312, "data/tokens_consumed_B": 18.519949312, "train/loss_slope": -2.671792576558389e-06} {"step": 8840, "timestamp": 1778335267.0683851, "train/loss": 2.3601160049438477, "train/z_loss": 0.0013489744043909014, "train/perplexity": 10.592180126427063, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021466.9016581, "perf/iters_per_sec": 0.9639105327883244, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374406814575194, "data/tokens_consumed": 18540920832, "data/tokens_consumed_B": 18.540920832, "train/loss_slope": -9.289433496190837e-07} {"step": 8850, "timestamp": 1778335277.4467435, "grad/layer_0/attn": 0.0031608149874955416, "grad/layer_0/mlp": 0.003424563445150852, "grad/layer_0/attn_mlp_ratio": 0.9229833074556972, "grad/layer_4/attn": 0.0017953033093363047, "grad/layer_4/mlp": 0.002656500553712249, "grad/layer_4/attn_mlp_ratio": 0.6758151204772003, "grad/layer_8/attn": 0.0031472728587687016, "grad/layer_8/mlp": 0.0034093863796442747, "grad/layer_8/attn_mlp_ratio": 0.9231200034256833, "grad/layer_12/attn": 0.005500946659594774, "grad/layer_12/mlp": 0.007196361199021339, "grad/layer_12/attn_mlp_ratio": 0.7644066815187377, "grad/layer_16/attn": 0.003541353391483426, "grad/layer_16/mlp": 0.004910514689981937, "grad/layer_16/attn_mlp_ratio": 0.7211776245350546, "grad/layer_20/attn": 0.0035853618755936623, "grad/layer_20/mlp": 0.006707505788654089, "grad/layer_20/attn_mlp_ratio": 0.5345298140786414, "grad/layer_24/attn": 0.014826265163719654, "grad/layer_24/mlp": 0.011977970600128174, "grad/layer_24/attn_mlp_ratio": 1.237794408994589, "grad/layer_27/attn": 0.010059265419840813, "grad/layer_27/mlp": 0.010996432043612003, "grad/layer_27/attn_mlp_ratio": 0.9147753824575179} {"step": 8850, "timestamp": 1778335278.0499804, "eos/sharpness": 71.04454040527342, "eos/L0_probe": 2.326401948928833, "eos/L_plus": 2.625152826309204, "eos/L_minus": 2.7380964756011963, "eos/grad_norm": 0.22997838258743286, "eos/embed_grad_frac": 0.039289169013500214, "eos/time_s": 0.6003866195678711} {"step": 8850, "timestamp": 1778335278.0717883, "train/loss": 2.3391278743743897, "train/z_loss": 0.001369127386715263, "train/perplexity": 10.372186767152888, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906691.7544578095, "perf/iters_per_sec": 0.9091814777649925, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0998904228210449, "data/tokens_consumed": 18561892352, "data/tokens_consumed_B": 18.561892352, "train/loss_slope": -6.268744922206446e-07} {"step": 8850, "timestamp": 1778335279.4321575, "geo/rankme_last": 429.17913818359375, "geo/layer_0/stable_rank_q_proj": 20.721717834472656, "geo/layer_0/stable_rank_k_proj": 17.188766479492188, "geo/layer_0/stable_rank_o_proj": 44.714332580566406, "geo/layer_0/stable_rank_gate_proj": 127.23727416992188, "geo/layer_0/stable_rank_down_proj": 56.8726692199707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0641016811132431, "geo/layer_0/attn_entropy_mean": 6.235239028930664, "geo/layer_0/attn_entropy_std": 0.4378848671913147, "geo/layer_7/stable_rank_q_proj": 42.189388275146484, "geo/layer_7/stable_rank_k_proj": 38.495731353759766, "geo/layer_7/stable_rank_o_proj": 89.25950622558594, "geo/layer_7/stable_rank_gate_proj": 78.95744323730469, "geo/layer_7/stable_rank_down_proj": 144.2354736328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40199965238571167, "geo/layer_7/attn_entropy_mean": 4.739995956420898, "geo/layer_7/attn_entropy_std": 0.7608326077461243, "geo/layer_14/stable_rank_q_proj": 51.61788558959961, "geo/layer_14/stable_rank_k_proj": 42.66730499267578, "geo/layer_14/stable_rank_o_proj": 42.37675476074219, "geo/layer_14/stable_rank_gate_proj": 71.90751647949219, "geo/layer_14/stable_rank_down_proj": 126.1583251953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3798280954360962, "geo/layer_14/attn_entropy_mean": 5.509263038635254, "geo/layer_14/attn_entropy_std": 0.45165112614631653, "geo/layer_21/stable_rank_q_proj": 39.1839714050293, "geo/layer_21/stable_rank_k_proj": 28.73441505432129, "geo/layer_21/stable_rank_o_proj": 65.36019134521484, "geo/layer_21/stable_rank_gate_proj": 61.34016418457031, "geo/layer_21/stable_rank_down_proj": 49.670555114746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1352415233850479, "geo/layer_21/attn_entropy_mean": 5.83765172958374, "geo/layer_21/attn_entropy_std": 0.326960951089859, "geo/layer_27/stable_rank_q_proj": 44.43095779418945, "geo/layer_27/stable_rank_k_proj": 30.07232666015625, "geo/layer_27/stable_rank_o_proj": 107.78044128417969, "geo/layer_27/stable_rank_gate_proj": 71.1625747680664, "geo/layer_27/stable_rank_down_proj": 129.12998962402344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09505733102560043, "geo/layer_27/attn_entropy_mean": 4.304177284240723, "geo/layer_27/attn_entropy_std": 0.6993378400802612, "attnres/final_alpha/block_0": 0.2593206763267517, "attnres/block_norm/0": 1.778562307357788, "attnres/final_alpha/block_1": 0.004016215447336435, "attnres/block_norm/1": 50379.8828125, "attnres/final_alpha/block_2": 0.008835255168378353, "attnres/block_norm/2": 29702.060546875, "attnres/final_alpha/block_3": 0.010963727720081806, "attnres/block_norm/3": 70615.421875, "attnres/final_alpha/block_4": 0.012206435203552246, "attnres/block_norm/4": 17196.443359375, "attnres/final_alpha/block_5": 0.6000679731369019, "attnres/block_norm/5": 7187.39453125, "attnres/final_alpha/block_6": 0.10458972305059433, "attnres/block_norm/6": 46793.4921875, "geo/tier1_time_s": 1.356652021408081, "geo/step": 8850.0, "geo/rankme_slope": 0.00029717285351640654} {"step": 8860, "timestamp": 1778335289.8156016, "train/loss": 2.347792053222656, "train/z_loss": 0.001374138321261853, "train/perplexity": 10.462443684772952, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786274.6999889344, "perf/iters_per_sec": 0.8517621517128632, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1740366697311402, "data/tokens_consumed": 18582863872, "data/tokens_consumed_B": 18.582863872, "train/loss_slope": -8.779870544580042e-07} {"step": 8870, "timestamp": 1778335300.19142, "train/loss": 2.3611632585525513, "train/z_loss": 0.001356288231909275, "train/perplexity": 10.603278635751126, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022085.7944292447, "perf/iters_per_sec": 0.9642056438585495, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037123155593872, "data/tokens_consumed": 18603835392, "data/tokens_consumed_B": 18.603835392, "train/loss_slope": -1.6658158192146563e-06} {"step": 8880, "timestamp": 1778335310.568763, "train/loss": 2.3267060041427614, "train/z_loss": 0.0013548135990276932, "train/perplexity": 10.24414173497187, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021802.5103249094, "perf/iters_per_sec": 0.9640705634712741, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372684717178344, "data/tokens_consumed": 18624806912, "data/tokens_consumed_B": 18.624806912, "train/loss_slope": -9.85896018686626e-07} {"step": 8890, "timestamp": 1778335320.9568915, "train/loss": 2.3898478984832763, "train/z_loss": 0.0013400725438259542, "train/perplexity": 10.91183411029522, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020064.6198444401, "perf/iters_per_sec": 0.9632418727132989, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381608486175538, "data/tokens_consumed": 18645778432, "data/tokens_consumed_B": 18.645778432, "train/loss_slope": 2.1499254093347877e-07} {"step": 8900, "timestamp": 1778335331.3391042, "grad/layer_0/attn": 0.0029681960586458445, "grad/layer_0/mlp": 0.0031798328272998333, "grad/layer_0/attn_mlp_ratio": 0.9334440288239605, "grad/layer_4/attn": 0.0021043617743998766, "grad/layer_4/mlp": 0.0024520268198102713, "grad/layer_4/attn_mlp_ratio": 0.8582131612823828, "grad/layer_8/attn": 0.004446825943887234, "grad/layer_8/mlp": 0.003374686697497964, "grad/layer_8/attn_mlp_ratio": 1.3177003410165848, "grad/layer_12/attn": 0.007098023779690266, "grad/layer_12/mlp": 0.006264813710004091, "grad/layer_12/attn_mlp_ratio": 1.1329983611573016, "grad/layer_16/attn": 0.00305488845333457, "grad/layer_16/mlp": 0.0042450688779354095, "grad/layer_16/attn_mlp_ratio": 0.7196322295851871, "grad/layer_20/attn": 0.0033690042328089476, "grad/layer_20/mlp": 0.006075169425457716, "grad/layer_20/attn_mlp_ratio": 0.5545531229525846, "grad/layer_24/attn": 0.015232979319989681, "grad/layer_24/mlp": 0.013244652189314365, "grad/layer_24/attn_mlp_ratio": 1.1501230071762227, "grad/layer_27/attn": 0.006282550282776356, "grad/layer_27/mlp": 0.012828130275011063, "grad/layer_27/attn_mlp_ratio": 0.4897479288965317} {"step": 8900, "timestamp": 1778335331.354775, "train/loss": 2.3162967443466185, "train/z_loss": 0.0013620953541249037, "train/perplexity": 10.138060871740265, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017847.5355687174, "perf/iters_per_sec": 0.9621846845477664, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0393015146255493, "data/tokens_consumed": 18666749952, "data/tokens_consumed_B": 18.666749952, "train/loss_slope": -2.339021257548736e-06} {"step": 8910, "timestamp": 1778335341.7380512, "train/loss": 2.288729119300842, "train/z_loss": 0.0013536957674659788, "train/perplexity": 9.862395784783894, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020624.4913794377, "perf/iters_per_sec": 0.9635088402650059, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378731966018677, "data/tokens_consumed": 18687721472, "data/tokens_consumed_B": 18.687721472, "train/loss_slope": -6.966519649058126e-06} {"step": 8920, "timestamp": 1778335352.1290696, "train/loss": 2.328516697883606, "train/z_loss": 0.0013584701344370842, "train/perplexity": 10.262707541714235, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019665.3122564491, "perf/iters_per_sec": 0.9630514680177923, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383661031723022, "data/tokens_consumed": 18708692992, "data/tokens_consumed_B": 18.708692992, "train/loss_slope": -8.404430454642169e-06} {"step": 8925, "timestamp": 1778335357.9249818, "eos/sharpness": 41.04218482971191, "eos/L0_probe": 2.330761671066284, "eos/L_plus": 2.575143575668335, "eos/L_minus": 2.4968016147613525, "eos/grad_norm": 0.1277323216199875, "eos/embed_grad_frac": 0.15841630101203918, "eos/time_s": 0.6085057258605957} {"step": 8925, "timestamp": 1778335359.3044646, "geo/rankme_last": 428.28900146484375, "geo/layer_0/stable_rank_q_proj": 20.72673225402832, "geo/layer_0/stable_rank_k_proj": 17.176673889160156, "geo/layer_0/stable_rank_o_proj": 44.6997184753418, "geo/layer_0/stable_rank_gate_proj": 127.0961685180664, "geo/layer_0/stable_rank_down_proj": 56.81068420410156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06405438482761383, "geo/layer_0/attn_entropy_mean": 6.233715057373047, "geo/layer_0/attn_entropy_std": 0.4374123215675354, "geo/layer_7/stable_rank_q_proj": 42.19377517700195, "geo/layer_7/stable_rank_k_proj": 38.53734588623047, "geo/layer_7/stable_rank_o_proj": 89.38075256347656, "geo/layer_7/stable_rank_gate_proj": 79.09095764160156, "geo/layer_7/stable_rank_down_proj": 144.08213806152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39214053750038147, "geo/layer_7/attn_entropy_mean": 4.741359233856201, "geo/layer_7/attn_entropy_std": 0.7719224095344543, "geo/layer_14/stable_rank_q_proj": 51.634498596191406, "geo/layer_14/stable_rank_k_proj": 42.64168167114258, "geo/layer_14/stable_rank_o_proj": 42.405372619628906, "geo/layer_14/stable_rank_gate_proj": 71.8994369506836, "geo/layer_14/stable_rank_down_proj": 125.9263916015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3680972456932068, "geo/layer_14/attn_entropy_mean": 5.528517723083496, "geo/layer_14/attn_entropy_std": 0.46968621015548706, "geo/layer_21/stable_rank_q_proj": 39.265113830566406, "geo/layer_21/stable_rank_k_proj": 28.724409103393555, "geo/layer_21/stable_rank_o_proj": 65.32225799560547, "geo/layer_21/stable_rank_gate_proj": 61.22458267211914, "geo/layer_21/stable_rank_down_proj": 49.598243713378906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1429627537727356, "geo/layer_21/attn_entropy_mean": 5.847749710083008, "geo/layer_21/attn_entropy_std": 0.3219495415687561, "geo/layer_27/stable_rank_q_proj": 44.35652542114258, "geo/layer_27/stable_rank_k_proj": 30.069042205810547, "geo/layer_27/stable_rank_o_proj": 107.49269104003906, "geo/layer_27/stable_rank_gate_proj": 71.06233215332031, "geo/layer_27/stable_rank_down_proj": 129.09222412109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1097649484872818, "geo/layer_27/attn_entropy_mean": 4.282144069671631, "geo/layer_27/attn_entropy_std": 0.6960042715072632, "attnres/final_alpha/block_0": 0.2550278604030609, "attnres/block_norm/0": 1.7786247730255127, "attnres/final_alpha/block_1": 0.003940294496715069, "attnres/block_norm/1": 50403.484375, "attnres/final_alpha/block_2": 0.008537298999726772, "attnres/block_norm/2": 29861.546875, "attnres/final_alpha/block_3": 0.010607507079839706, "attnres/block_norm/3": 70177.8671875, "attnres/final_alpha/block_4": 0.011845956556499004, "attnres/block_norm/4": 17266.978515625, "attnres/final_alpha/block_5": 0.6095538139343262, "attnres/block_norm/5": 7181.76025390625, "attnres/final_alpha/block_6": 0.10048730671405792, "attnres/block_norm/6": 47666.2421875, "geo/tier1_time_s": 1.3597424030303955, "geo/step": 8925.0, "geo/rankme_slope": 0.00027988918223539416} {"step": 8930, "timestamp": 1778335364.4932418, "train/loss": 2.3513285636901857, "train/z_loss": 0.0013555844780057669, "train/perplexity": 10.499509729976728, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697091.2503558, "perf/iters_per_sec": 0.8092361690310478, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.235733199119568, "data/tokens_consumed": 18729664512, "data/tokens_consumed_B": 18.729664512, "train/loss_slope": -8.005263188061973e-06} {"step": 8940, "timestamp": 1778335374.862173, "train/loss": 2.370280885696411, "train/z_loss": 0.0013403963530436158, "train/perplexity": 10.700397450626944, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023827.53835712, "perf/iters_per_sec": 0.9650361720834351, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362305879592895, "data/tokens_consumed": 18750636032, "data/tokens_consumed_B": 18.750636032, "train/loss_slope": -7.3609803054127595e-06} {"step": 8950, "timestamp": 1778335385.202278, "grad/layer_0/attn": 0.002868011826649308, "grad/layer_0/mlp": 0.003114598337560892, "grad/layer_0/attn_mlp_ratio": 0.9208287630476399, "grad/layer_4/attn": 0.0023130704648792744, "grad/layer_4/mlp": 0.002635100157931447, "grad/layer_4/attn_mlp_ratio": 0.8777922046484236, "grad/layer_8/attn": 0.005802956875413656, "grad/layer_8/mlp": 0.003657572204247117, "grad/layer_8/attn_mlp_ratio": 1.586559715764293, "grad/layer_12/attn": 0.005656855646520853, "grad/layer_12/mlp": 0.007259010802954435, "grad/layer_12/attn_mlp_ratio": 0.7792873880680492, "grad/layer_16/attn": 0.004553910810500383, "grad/layer_16/mlp": 0.005214918404817581, "grad/layer_16/attn_mlp_ratio": 0.8732467834911412, "grad/layer_20/attn": 0.005682552233338356, "grad/layer_20/mlp": 0.00776366051286459, "grad/layer_20/attn_mlp_ratio": 0.7319423808817992, "grad/layer_24/attn": 0.02219262532889843, "grad/layer_24/mlp": 0.017238616943359375, "grad/layer_24/attn_mlp_ratio": 1.287378521901061, "grad/layer_27/attn": 0.006205251440405846, "grad/layer_27/mlp": 0.018164711073040962, "grad/layer_27/attn_mlp_ratio": 0.3416102453429223} {"step": 8950, "timestamp": 1778335385.2182307, "train/loss": 2.3285715579986572, "train/z_loss": 0.0013527508010156453, "train/perplexity": 10.263270570474479, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026291.5743785005, "perf/iters_per_sec": 0.9662111160175803, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034970498085022, "data/tokens_consumed": 18771607552, "data/tokens_consumed_B": 18.771607552, "train/loss_slope": -8.330165091628192e-06} {"step": 8960, "timestamp": 1778335395.586668, "train/loss": 2.321272611618042, "train/z_loss": 0.0013607032829895615, "train/perplexity": 10.188632230870653, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023771.6156488808, "perf/iters_per_sec": 0.9650095060581593, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362592220306397, "data/tokens_consumed": 18792579072, "data/tokens_consumed_B": 18.792579072, "train/loss_slope": -9.776530521895292e-06} {"step": 8970, "timestamp": 1778335405.944188, "train/loss": 2.315470838546753, "train/z_loss": 0.0013475404004566372, "train/perplexity": 10.129691245204212, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026177.9660194016, "perf/iters_per_sec": 0.9661569433304794, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350285291671752, "data/tokens_consumed": 18813550592, "data/tokens_consumed_B": 18.813550592, "train/loss_slope": -1.190320094211877e-05} {"step": 8980, "timestamp": 1778335416.2902708, "train/loss": 2.3872405767440794, "train/z_loss": 0.0013609023299068213, "train/perplexity": 10.88342050580588, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028262.9368839888, "perf/iters_per_sec": 0.9671511349124855, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339645624160767, "data/tokens_consumed": 18834522112, "data/tokens_consumed_B": 18.834522112, "train/loss_slope": -6.829130688909679e-06} {"step": 8990, "timestamp": 1778335426.6394124, "train/loss": 2.330825138092041, "train/z_loss": 0.0013536850223317743, "train/perplexity": 10.286425753955683, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027408.8258718785, "perf/iters_per_sec": 0.9667438630446809, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344001531600953, "data/tokens_consumed": 18855493632, "data/tokens_consumed_B": 18.855493632, "train/loss_slope": -7.415352116609716e-06} {"step": 9000, "timestamp": 1778335437.000065, "grad/layer_0/attn": 0.0025205682031810284, "grad/layer_0/mlp": 0.002961890073493123, "grad/layer_0/attn_mlp_ratio": 0.8509998870783183, "grad/layer_4/attn": 0.0019073333824053407, "grad/layer_4/mlp": 0.0026911236345767975, "grad/layer_4/attn_mlp_ratio": 0.708749790245258, "grad/layer_8/attn": 0.005147769581526518, "grad/layer_8/mlp": 0.003433992387726903, "grad/layer_8/attn_mlp_ratio": 1.4990625634519172, "grad/layer_12/attn": 0.010662213899195194, "grad/layer_12/mlp": 0.006964206695556641, "grad/layer_12/attn_mlp_ratio": 1.531001909075702, "grad/layer_16/attn": 0.008486112579703331, "grad/layer_16/mlp": 0.0047777192667126656, "grad/layer_16/attn_mlp_ratio": 1.776184812952345, "grad/layer_20/attn": 0.002562830923125148, "grad/layer_20/mlp": 0.005481217056512833, "grad/layer_20/attn_mlp_ratio": 0.4675660259291075, "grad/layer_24/attn": 0.005132364574819803, "grad/layer_24/mlp": 0.007601631339639425, "grad/layer_24/attn_mlp_ratio": 0.6751661949902759, "grad/layer_27/attn": 0.00422885874286294, "grad/layer_27/mlp": 0.007036253809928894, "grad/layer_27/attn_mlp_ratio": 0.6010099687982515} {"step": 9000, "timestamp": 1778335437.6025863, "eos/sharpness": 16.18545055389404, "eos/L0_probe": 2.3249404430389404, "eos/L_plus": 2.4199368953704834, "eos/L_minus": 2.391798496246338, "eos/grad_norm": 0.10023399442434311, "eos/embed_grad_frac": 0.25363585352897644, "eos/time_s": 0.5996055603027344} {"step": 9000, "timestamp": 1778335437.6238513, "train/loss": 2.3649279117584228, "train/z_loss": 0.0013488475233316422, "train/perplexity": 10.643271535124725, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910352.0575264425, "perf/iters_per_sec": 0.9109268462784016, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0977829933166503, "data/tokens_consumed": 18876465152, "data/tokens_consumed_B": 18.876465152, "train/loss_slope": -8.644967761584859e-06} {"step": 9000, "timestamp": 1778335438.9889426, "geo/rankme_last": 429.0843811035156, "geo/layer_0/stable_rank_q_proj": 20.76755142211914, "geo/layer_0/stable_rank_k_proj": 17.199373245239258, "geo/layer_0/stable_rank_o_proj": 44.779266357421875, "geo/layer_0/stable_rank_gate_proj": 127.20217895507812, "geo/layer_0/stable_rank_down_proj": 56.82611846923828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06336301565170288, "geo/layer_0/attn_entropy_mean": 6.232992172241211, "geo/layer_0/attn_entropy_std": 0.4376266896724701, "geo/layer_7/stable_rank_q_proj": 42.185726165771484, "geo/layer_7/stable_rank_k_proj": 38.642208099365234, "geo/layer_7/stable_rank_o_proj": 89.3580322265625, "geo/layer_7/stable_rank_gate_proj": 78.93850708007812, "geo/layer_7/stable_rank_down_proj": 143.88052368164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3962874114513397, "geo/layer_7/attn_entropy_mean": 4.739453315734863, "geo/layer_7/attn_entropy_std": 0.7617205381393433, "geo/layer_14/stable_rank_q_proj": 51.5262336730957, "geo/layer_14/stable_rank_k_proj": 42.55735397338867, "geo/layer_14/stable_rank_o_proj": 42.43647384643555, "geo/layer_14/stable_rank_gate_proj": 72.00093078613281, "geo/layer_14/stable_rank_down_proj": 125.73335266113281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3550370931625366, "geo/layer_14/attn_entropy_mean": 5.517023086547852, "geo/layer_14/attn_entropy_std": 0.43210867047309875, "geo/layer_21/stable_rank_q_proj": 39.294677734375, "geo/layer_21/stable_rank_k_proj": 28.71225929260254, "geo/layer_21/stable_rank_o_proj": 65.28714752197266, "geo/layer_21/stable_rank_gate_proj": 61.244632720947266, "geo/layer_21/stable_rank_down_proj": 49.62289810180664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13163432478904724, "geo/layer_21/attn_entropy_mean": 5.861213684082031, "geo/layer_21/attn_entropy_std": 0.33192649483680725, "geo/layer_27/stable_rank_q_proj": 44.286705017089844, "geo/layer_27/stable_rank_k_proj": 30.067583084106445, "geo/layer_27/stable_rank_o_proj": 107.52674102783203, "geo/layer_27/stable_rank_gate_proj": 71.09937286376953, "geo/layer_27/stable_rank_down_proj": 129.15692138671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09911137074232101, "geo/layer_27/attn_entropy_mean": 4.287698745727539, "geo/layer_27/attn_entropy_std": 0.6926984786987305, "attnres/final_alpha/block_0": 0.25514155626296997, "attnres/block_norm/0": 1.7787909507751465, "attnres/final_alpha/block_1": 0.0038775955326855183, "attnres/block_norm/1": 50387.64453125, "attnres/final_alpha/block_2": 0.008635809645056725, "attnres/block_norm/2": 29721.421875, "attnres/final_alpha/block_3": 0.010570992715656757, "attnres/block_norm/3": 71125.90625, "attnres/final_alpha/block_4": 0.012075642123818398, "attnres/block_norm/4": 17255.287109375, "attnres/final_alpha/block_5": 0.6102758049964905, "attnres/block_norm/5": 7076.701171875, "attnres/final_alpha/block_6": 0.09942261874675751, "attnres/block_norm/6": 47637.9296875, "geo/tier1_time_s": 1.3607463836669922, "geo/step": 9000.0, "geo/rankme_slope": 0.0002633817980317127} {"step": 9000, "timestamp": 1778335445.809735, "geo/ww_alpha_mean": 7.765172787043736, "geo/ww_alpha_std": 4.792334106397704, "geo/ww_alpha_min": 1.3555290303873062, "geo/ww_alpha_max": 30.82075362005443, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.8814635192448947, "geo/ww_alpha_by_type/k_proj": 4.423885060314443, "geo/ww_alpha_by_type/v_proj": 9.063573340338488, "geo/ww_alpha_by_type/o_proj": 8.805032289121977, "geo/ww_alpha_by_type/gate_proj": 7.780499185197398, "geo/ww_alpha_by_type/up_proj": 12.393257971728216, "geo/ww_alpha_by_type/down_proj": 8.108160514782588, "geo/twonn_id/layer_0": 0.7385921478271484, "geo/twonn_id/layer_7": 3.272836685180664, "geo/twonn_id/layer_14": 6.241809368133545, "geo/twonn_id/layer_21": 8.573368072509766, "geo/twonn_id/layer_27": 6.036157608032227, "geo/tier2_time_s": 6.8147289752960205} {"step": 9000, "timestamp": 1778335446.5662475, "eoc/jacobian_sigma/layer_0/attn": 1314.27734375, "eoc/jacobian_sigma/layer_0/mlp": 11048.6435546875, "eoc/jacobian_sigma/layer_0": 11048.6435546875, "eoc/jacobian_sigma/layer_7/attn": 1.1351078748703003, "eoc/jacobian_sigma/layer_7/mlp": 1.8299840688705444, "eoc/jacobian_sigma/layer_7": 1.8299840688705444, "eoc/jacobian_sigma/layer_14/attn": 2.145503044128418, "eoc/jacobian_sigma/layer_14/mlp": 14.531947135925293, "eoc/jacobian_sigma/layer_14": 14.531947135925293, "eoc/jacobian_sigma/layer_21/attn": 1.1000237464904785, "eoc/jacobian_sigma/layer_21/mlp": 6.234391689300537, "eoc/jacobian_sigma/layer_21": 6.234391689300537, "eoc/jacobian_sigma/layer_27/attn": 3.962207078933716, "eoc/jacobian_sigma/layer_27/mlp": 39.63835906982422, "eoc/jacobian_sigma/layer_27": 39.63835906982422, "eoc/layer0_sigma": 11048.6435546875, "eoc/sigma_max": 39.63835906982422, "eoc/sigma_min": 1.8299840688705444, "eoc/sigma_mean": 15.558670490980148, "eoc/time_s": 0.7501492500305176} {"step": 9010, "timestamp": 1778335456.9476774, "train/loss": 2.3626941680908202, "train/z_loss": 0.0013569869915954769, "train/perplexity": 10.619523727861973, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085500.9583563982, "perf/iters_per_sec": 0.5176071922094336, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9319669723510742, "data/tokens_consumed": 18897436672, "data/tokens_consumed_B": 18.897436672, "train/loss_slope": -8.849947695041152e-06} {"step": 9020, "timestamp": 1778335467.2979035, "train/loss": 2.3404395818710326, "train/z_loss": 0.001350344135425985, "train/perplexity": 10.385800969265405, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027686.9056328966, "perf/iters_per_sec": 0.9668764618076785, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342582941055298, "data/tokens_consumed": 18918408192, "data/tokens_consumed_B": 18.918408192, "train/loss_slope": -9.307142238233562e-06} {"step": 9030, "timestamp": 1778335477.6428628, "train/loss": 2.3112382888793945, "train/z_loss": 0.0013593733543530106, "train/perplexity": 10.086907430075911, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028443.6686575536, "perf/iters_per_sec": 0.9672373145377892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338724374771118, "data/tokens_consumed": 18939379712, "data/tokens_consumed_B": 18.939379712, "train/loss_slope": -9.632192915565392e-06} {"step": 9040, "timestamp": 1778335487.9890683, "train/loss": 2.3259973526000977, "train/z_loss": 0.0013547753682360054, "train/perplexity": 10.236884779758066, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027732.3403654282, "perf/iters_per_sec": 0.9668981267764226, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342351198196411, "data/tokens_consumed": 18960351232, "data/tokens_consumed_B": 18.960351232, "train/loss_slope": -1.1254530413673735e-05} {"step": 9050, "timestamp": 1778335498.3426254, "grad/layer_0/attn": 0.002996321301907301, "grad/layer_0/mlp": 0.0031807180494070053, "grad/layer_0/attn_mlp_ratio": 0.9420266622699398, "grad/layer_4/attn": 0.0018993347184732556, "grad/layer_4/mlp": 0.0024920504074543715, "grad/layer_4/attn_mlp_ratio": 0.7621573931956239, "grad/layer_8/attn": 0.0038256680127233267, "grad/layer_8/mlp": 0.0032053410541266203, "grad/layer_8/attn_mlp_ratio": 1.1935291217903858, "grad/layer_12/attn": 0.006582619156688452, "grad/layer_12/mlp": 0.007050593849271536, "grad/layer_12/attn_mlp_ratio": 0.9336261886657883, "grad/layer_16/attn": 0.003497791476547718, "grad/layer_16/mlp": 0.004906731192022562, "grad/layer_16/attn_mlp_ratio": 0.7128557217377036, "grad/layer_20/attn": 0.003162381472066045, "grad/layer_20/mlp": 0.0066552432253956795, "grad/layer_20/attn_mlp_ratio": 0.47517142761689024, "grad/layer_24/attn": 0.01047765463590622, "grad/layer_24/mlp": 0.011628522537648678, "grad/layer_24/attn_mlp_ratio": 0.9010305919673415, "grad/layer_27/attn": 0.004663972184062004, "grad/layer_27/mlp": 0.011668100021779537, "grad/layer_27/attn_mlp_ratio": 0.39971993172704173} {"step": 9050, "timestamp": 1778335498.358291, "train/loss": 2.318769693374634, "train/z_loss": 0.0013473216677084565, "train/perplexity": 10.16316280462674, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023822.4628131157, "perf/iters_per_sec": 0.9650337518754557, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362331867218018, "data/tokens_consumed": 18981322752, "data/tokens_consumed_B": 18.981322752, "train/loss_slope": -1.273293496608881e-05} {"step": 9060, "timestamp": 1778335508.702072, "train/loss": 2.322346043586731, "train/z_loss": 0.0013664241996593773, "train/perplexity": 10.199574906482669, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028785.1543873951, "perf/iters_per_sec": 0.9674001476227737, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336984157562257, "data/tokens_consumed": 19002294272, "data/tokens_consumed_B": 19.002294272, "train/loss_slope": -1.6570214841804346e-05} {"step": 9070, "timestamp": 1778335519.0437806, "train/loss": 2.3623377799987795, "train/z_loss": 0.0013621115474961698, "train/perplexity": 10.61573973038809, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028837.9852320333, "perf/iters_per_sec": 0.9674253393325964, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033671498298645, "data/tokens_consumed": 19023265792, "data/tokens_consumed_B": 19.023265792, "train/loss_slope": -1.9114777264278893e-05} {"step": 9075, "timestamp": 1778335524.8310025, "eos/sharpness": 39.70749378204345, "eos/L0_probe": 2.317028522491455, "eos/L_plus": 2.509950876235962, "eos/L_minus": 2.521181106567383, "eos/grad_norm": 0.1337902694940567, "eos/embed_grad_frac": 0.12653204798698425, "eos/time_s": 0.6093282699584961} {"step": 9075, "timestamp": 1778335526.2100003, "geo/rankme_last": 428.0375061035156, "geo/layer_0/stable_rank_q_proj": 20.765121459960938, "geo/layer_0/stable_rank_k_proj": 17.221601486206055, "geo/layer_0/stable_rank_o_proj": 44.68164825439453, "geo/layer_0/stable_rank_gate_proj": 127.0196533203125, "geo/layer_0/stable_rank_down_proj": 56.8654899597168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06317326426506042, "geo/layer_0/attn_entropy_mean": 6.234929084777832, "geo/layer_0/attn_entropy_std": 0.4343000054359436, "geo/layer_7/stable_rank_q_proj": 42.20420455932617, "geo/layer_7/stable_rank_k_proj": 38.5634880065918, "geo/layer_7/stable_rank_o_proj": 89.35646057128906, "geo/layer_7/stable_rank_gate_proj": 78.82037353515625, "geo/layer_7/stable_rank_down_proj": 143.7488250732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38797181844711304, "geo/layer_7/attn_entropy_mean": 4.701863765716553, "geo/layer_7/attn_entropy_std": 0.7501853704452515, "geo/layer_14/stable_rank_q_proj": 51.51988983154297, "geo/layer_14/stable_rank_k_proj": 42.62322235107422, "geo/layer_14/stable_rank_o_proj": 42.41188430786133, "geo/layer_14/stable_rank_gate_proj": 72.08854675292969, "geo/layer_14/stable_rank_down_proj": 125.77298736572266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37843048572540283, "geo/layer_14/attn_entropy_mean": 5.4884796142578125, "geo/layer_14/attn_entropy_std": 0.48000648617744446, "geo/layer_21/stable_rank_q_proj": 39.310359954833984, "geo/layer_21/stable_rank_k_proj": 28.746126174926758, "geo/layer_21/stable_rank_o_proj": 65.3019027709961, "geo/layer_21/stable_rank_gate_proj": 61.23854446411133, "geo/layer_21/stable_rank_down_proj": 49.63827896118164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13797514140605927, "geo/layer_21/attn_entropy_mean": 5.86126708984375, "geo/layer_21/attn_entropy_std": 0.3222029209136963, "geo/layer_27/stable_rank_q_proj": 44.25783157348633, "geo/layer_27/stable_rank_k_proj": 30.082054138183594, "geo/layer_27/stable_rank_o_proj": 107.6797866821289, "geo/layer_27/stable_rank_gate_proj": 71.10247802734375, "geo/layer_27/stable_rank_down_proj": 129.22743225097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09751413017511368, "geo/layer_27/attn_entropy_mean": 4.312450885772705, "geo/layer_27/attn_entropy_std": 0.6683202981948853, "attnres/final_alpha/block_0": 0.2564202547073364, "attnres/block_norm/0": 1.778846025466919, "attnres/final_alpha/block_1": 0.003961751703172922, "attnres/block_norm/1": 50447.109375, "attnres/final_alpha/block_2": 0.008560938760638237, "attnres/block_norm/2": 29721.658203125, "attnres/final_alpha/block_3": 0.010625551454722881, "attnres/block_norm/3": 71367.1015625, "attnres/final_alpha/block_4": 0.01221570372581482, "attnres/block_norm/4": 17235.52734375, "attnres/final_alpha/block_5": 0.6068712472915649, "attnres/block_norm/5": 7156.7958984375, "attnres/final_alpha/block_6": 0.10134457796812057, "attnres/block_norm/6": 47896.1328125, "geo/tier1_time_s": 1.35847806930542, "geo/step": 9075.0, "geo/rankme_slope": 0.00023093594859818927} {"step": 9080, "timestamp": 1778335531.3912308, "train/loss": 2.3400107622146606, "train/z_loss": 0.0013560347957536577, "train/perplexity": 10.381348288429383, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699623.3840214175, "perf/iters_per_sec": 0.8104435844523513, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2338921785354615, "data/tokens_consumed": 19044237312, "data/tokens_consumed_B": 19.044237312, "train/loss_slope": -2.0812774684527214e-05} {"step": 9090, "timestamp": 1778335541.7394164, "train/loss": 2.3943065881729124, "train/z_loss": 0.001333139289636165, "train/perplexity": 10.960595217079328, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027674.3787190495, "perf/iters_per_sec": 0.9668704885096786, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342646837234497, "data/tokens_consumed": 19065208832, "data/tokens_consumed_B": 19.065208832, "train/loss_slope": -1.6354336682790943e-05} {"step": 9100, "timestamp": 1778335552.0801141, "grad/layer_0/attn": 0.002745287260040641, "grad/layer_0/mlp": 0.002972905058413744, "grad/layer_0/attn_mlp_ratio": 0.923435869547028, "grad/layer_4/attn": 0.002129824599251151, "grad/layer_4/mlp": 0.002614166121929884, "grad/layer_4/attn_mlp_ratio": 0.8147242441526261, "grad/layer_8/attn": 0.004360169172286987, "grad/layer_8/mlp": 0.003415314480662346, "grad/layer_8/attn_mlp_ratio": 1.276652287603153, "grad/layer_12/attn": 0.0051882145926356316, "grad/layer_12/mlp": 0.007190655451267958, "grad/layer_12/attn_mlp_ratio": 0.7215217799885816, "grad/layer_16/attn": 0.004301474895328283, "grad/layer_16/mlp": 0.004555732477456331, "grad/layer_16/attn_mlp_ratio": 0.9441895067796076, "grad/layer_20/attn": 0.006040104664862156, "grad/layer_20/mlp": 0.0059760091826319695, "grad/layer_20/attn_mlp_ratio": 1.010725448907261, "grad/layer_24/attn": 0.004898846615105867, "grad/layer_24/mlp": 0.008263624273240566, "grad/layer_24/attn_mlp_ratio": 0.5928205825726313, "grad/layer_27/attn": 0.0049787722527980804, "grad/layer_27/mlp": 0.007486889138817787, "grad/layer_27/attn_mlp_ratio": 0.6649987857419206} {"step": 9100, "timestamp": 1778335552.0959237, "train/loss": 2.309524416923523, "train/z_loss": 0.0013635852956213057, "train/perplexity": 10.069634568274797, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025799.0990973313, "perf/iters_per_sec": 0.9659762855040223, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035222101211548, "data/tokens_consumed": 19086180352, "data/tokens_consumed_B": 19.086180352, "train/loss_slope": -1.7501937752903132e-05} {"step": 9110, "timestamp": 1778335562.4663732, "train/loss": 2.3789641141891478, "train/z_loss": 0.001334319985471666, "train/perplexity": 10.793716013517027, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023648.9317756617, "perf/iters_per_sec": 0.9649510058286961, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036322045326233, "data/tokens_consumed": 19107151872, "data/tokens_consumed_B": 19.107151872, "train/loss_slope": -1.429476152838371e-05} {"step": 9120, "timestamp": 1778335572.8137672, "train/loss": 2.303372049331665, "train/z_loss": 0.0013544562738388778, "train/perplexity": 10.00787266069001, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027965.1551979834, "perf/iters_per_sec": 0.9670091415395657, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341163873672485, "data/tokens_consumed": 19128123392, "data/tokens_consumed_B": 19.128123392, "train/loss_slope": -1.7433266148995015e-05} {"step": 9130, "timestamp": 1778335583.167567, "train/loss": 2.3605518102645875, "train/z_loss": 0.0013484462513588368, "train/perplexity": 10.596797260897205, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026398.2863541879, "perf/iters_per_sec": 0.9662620002528133, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349159955978393, "data/tokens_consumed": 19149094912, "data/tokens_consumed_B": 19.149094912, "train/loss_slope": -1.8496316923524364e-05} {"step": 9140, "timestamp": 1778335593.5291314, "train/loss": 2.380227541923523, "train/z_loss": 0.001361485302913934, "train/perplexity": 10.807361712047303, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025393.4640277023, "perf/iters_per_sec": 0.9657828636301529, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354294300079345, "data/tokens_consumed": 19170066432, "data/tokens_consumed_B": 19.170066432, "train/loss_slope": -1.5002572168074992e-05} {"step": 9150, "timestamp": 1778335603.8795686, "grad/layer_0/attn": 0.002861594781279564, "grad/layer_0/mlp": 0.0032153259962797165, "grad/layer_0/attn_mlp_ratio": 0.8899858663140151, "grad/layer_4/attn": 0.0019017572049051523, "grad/layer_4/mlp": 0.002585452049970627, "grad/layer_4/attn_mlp_ratio": 0.7355607818643087, "grad/layer_8/attn": 0.004807023331522942, "grad/layer_8/mlp": 0.0036169562954455614, "grad/layer_8/attn_mlp_ratio": 1.3290244078075981, "grad/layer_12/attn": 0.005671459715813398, "grad/layer_12/mlp": 0.0068049621768295765, "grad/layer_12/attn_mlp_ratio": 0.8334300007987296, "grad/layer_16/attn": 0.003878588555380702, "grad/layer_16/mlp": 0.004329107701778412, "grad/layer_16/attn_mlp_ratio": 0.8959325415244603, "grad/layer_20/attn": 0.0028447930235415697, "grad/layer_20/mlp": 0.006070998497307301, "grad/layer_20/attn_mlp_ratio": 0.46858733006516173, "grad/layer_24/attn": 0.011683095246553421, "grad/layer_24/mlp": 0.011463925242424011, "grad/layer_24/attn_mlp_ratio": 1.0191182250043394, "grad/layer_27/attn": 0.004939207341521978, "grad/layer_27/mlp": 0.0120219262316823, "grad/layer_27/attn_mlp_ratio": 0.410849909178474} {"step": 9150, "timestamp": 1778335604.4824333, "eos/sharpness": 60.144019126892076, "eos/L0_probe": 2.3155734539031982, "eos/L_plus": 2.608025550842285, "eos/L_minus": 2.6245615482330322, "eos/grad_norm": 0.1879950910806656, "eos/embed_grad_frac": 0.07263048738241196, "eos/time_s": 0.6000807285308838} {"step": 9150, "timestamp": 1778335604.5019271, "train/loss": 2.32452507019043, "train/z_loss": 0.0013638764386996627, "train/perplexity": 10.221824283739164, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912119.0222392785, "perf/iters_per_sec": 0.9117694007107155, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967685461044312, "data/tokens_consumed": 19191037952, "data/tokens_consumed_B": 19.191037952, "train/loss_slope": -1.6436203156295863e-05} {"step": 9150, "timestamp": 1778335605.8671389, "geo/rankme_last": 428.6396179199219, "geo/layer_0/stable_rank_q_proj": 20.753267288208008, "geo/layer_0/stable_rank_k_proj": 17.249164581298828, "geo/layer_0/stable_rank_o_proj": 44.665836334228516, "geo/layer_0/stable_rank_gate_proj": 126.67134857177734, "geo/layer_0/stable_rank_down_proj": 56.903507232666016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0676732063293457, "geo/layer_0/attn_entropy_mean": 6.2359209060668945, "geo/layer_0/attn_entropy_std": 0.4347585439682007, "geo/layer_7/stable_rank_q_proj": 42.23467254638672, "geo/layer_7/stable_rank_k_proj": 38.516178131103516, "geo/layer_7/stable_rank_o_proj": 89.22359466552734, "geo/layer_7/stable_rank_gate_proj": 78.8865737915039, "geo/layer_7/stable_rank_down_proj": 143.888671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40415096282958984, "geo/layer_7/attn_entropy_mean": 4.739546298980713, "geo/layer_7/attn_entropy_std": 0.7744154334068298, "geo/layer_14/stable_rank_q_proj": 51.58202362060547, "geo/layer_14/stable_rank_k_proj": 42.69342803955078, "geo/layer_14/stable_rank_o_proj": 42.4344367980957, "geo/layer_14/stable_rank_gate_proj": 71.93441009521484, "geo/layer_14/stable_rank_down_proj": 125.94105529785156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37127572298049927, "geo/layer_14/attn_entropy_mean": 5.520991325378418, "geo/layer_14/attn_entropy_std": 0.46318158507347107, "geo/layer_21/stable_rank_q_proj": 39.28754806518555, "geo/layer_21/stable_rank_k_proj": 28.81447982788086, "geo/layer_21/stable_rank_o_proj": 65.3430404663086, "geo/layer_21/stable_rank_gate_proj": 61.17484664916992, "geo/layer_21/stable_rank_down_proj": 49.585872650146484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13309194147586823, "geo/layer_21/attn_entropy_mean": 5.862576484680176, "geo/layer_21/attn_entropy_std": 0.32482343912124634, "geo/layer_27/stable_rank_q_proj": 44.20576858520508, "geo/layer_27/stable_rank_k_proj": 30.092853546142578, "geo/layer_27/stable_rank_o_proj": 107.76600646972656, "geo/layer_27/stable_rank_gate_proj": 71.0899658203125, "geo/layer_27/stable_rank_down_proj": 129.1520538330078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08972866833209991, "geo/layer_27/attn_entropy_mean": 4.296207427978516, "geo/layer_27/attn_entropy_std": 0.7013146877288818, "attnres/final_alpha/block_0": 0.2569434642791748, "attnres/block_norm/0": 1.7789113521575928, "attnres/final_alpha/block_1": 0.00394015945494175, "attnres/block_norm/1": 50281.9609375, "attnres/final_alpha/block_2": 0.008673778735101223, "attnres/block_norm/2": 29852.962890625, "attnres/final_alpha/block_3": 0.010621427558362484, "attnres/block_norm/3": 70751.1484375, "attnres/final_alpha/block_4": 0.012117106467485428, "attnres/block_norm/4": 17220.70703125, "attnres/final_alpha/block_5": 0.6068047285079956, "attnres/block_norm/5": 7161.5517578125, "attnres/final_alpha/block_6": 0.10089927911758423, "attnres/block_norm/6": 47584.234375, "geo/tier1_time_s": 1.3611204624176025, "geo/step": 9150.0, "geo/rankme_slope": 0.00022070208942952181} {"step": 9160, "timestamp": 1778335616.2170837, "train/loss": 2.3472111225128174, "train/z_loss": 0.0013603513827547431, "train/perplexity": 10.456367495030001, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790714.4786056874, "perf/iters_per_sec": 0.8538792031315267, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171125841140747, "data/tokens_consumed": 19212009472, "data/tokens_consumed_B": 19.212009472, "train/loss_slope": -1.6578884908754545e-05} {"step": 9170, "timestamp": 1778335626.5709054, "train/loss": 2.3540746927261353, "train/z_loss": 0.0013541749096475542, "train/perplexity": 10.528382364354675, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026822.444514191, "perf/iters_per_sec": 0.9664642546244578, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346994161605836, "data/tokens_consumed": 19232980992, "data/tokens_consumed_B": 19.232980992, "train/loss_slope": -1.2165208911523785e-05} {"step": 9180, "timestamp": 1778335636.9194536, "train/loss": 2.3259836196899415, "train/z_loss": 0.0013397179543972015, "train/perplexity": 10.236744198504404, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027331.6313718555, "perf/iters_per_sec": 0.966707053838661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344395399093629, "data/tokens_consumed": 19253952512, "data/tokens_consumed_B": 19.253952512, "train/loss_slope": -1.2605399968135555e-05} {"step": 9190, "timestamp": 1778335647.2753696, "train/loss": 2.340241241455078, "train/z_loss": 0.0013435171567834914, "train/perplexity": 10.383741249450734, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026483.2531817306, "perf/iters_per_sec": 0.9663025155934003, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348726034164428, "data/tokens_consumed": 19274924032, "data/tokens_consumed_B": 19.274924032, "train/loss_slope": -1.2675186438207144e-05} {"step": 9200, "timestamp": 1778335657.6139345, "grad/layer_0/attn": 0.003020112868398428, "grad/layer_0/mlp": 0.0032191781792789698, "grad/layer_0/attn_mlp_ratio": 0.9381626633846671, "grad/layer_4/attn": 0.00257602846249938, "grad/layer_4/mlp": 0.0026306048966944218, "grad/layer_4/attn_mlp_ratio": 0.9792532386034308, "grad/layer_8/attn": 0.004749544896185398, "grad/layer_8/mlp": 0.00347872800193727, "grad/layer_8/attn_mlp_ratio": 1.365310756405602, "grad/layer_12/attn": 0.007542252074927092, "grad/layer_12/mlp": 0.006788581609725952, "grad/layer_12/attn_mlp_ratio": 1.1110202981163741, "grad/layer_16/attn": 0.003056173212826252, "grad/layer_16/mlp": 0.00438230624422431, "grad/layer_16/attn_mlp_ratio": 0.6973892221966993, "grad/layer_20/attn": 0.003016501897946, "grad/layer_20/mlp": 0.0056365723721683025, "grad/layer_20/attn_mlp_ratio": 0.5351659918932256, "grad/layer_24/attn": 0.009742158465087414, "grad/layer_24/mlp": 0.009795974008738995, "grad/layer_24/attn_mlp_ratio": 0.9945063509708982, "grad/layer_27/attn": 0.004514402709901333, "grad/layer_27/mlp": 0.009627935476601124, "grad/layer_27/attn_mlp_ratio": 0.4688858451517618} {"step": 9200, "timestamp": 1778335657.6296487, "train/loss": 2.3748783826828004, "train/z_loss": 0.0013439075439237059, "train/perplexity": 10.749705756200155, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026840.1916795403, "perf/iters_per_sec": 0.9664727171323492, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346903562545777, "data/tokens_consumed": 19295895552, "data/tokens_consumed_B": 19.295895552, "train/loss_slope": -1.0228381556074016e-05} {"step": 9210, "timestamp": 1778335667.9817715, "train/loss": 2.3360449075698853, "train/z_loss": 0.0013598741847090424, "train/perplexity": 10.34025890122372, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026772.7074042137, "perf/iters_per_sec": 0.966440538122279, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034724807739258, "data/tokens_consumed": 19316867072, "data/tokens_consumed_B": 19.316867072, "train/loss_slope": -7.94638222557914e-06} {"step": 9220, "timestamp": 1778335678.338885, "train/loss": 2.340774154663086, "train/z_loss": 0.0013623442267999054, "train/perplexity": 10.38927635704608, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026059.0502918693, "perf/iters_per_sec": 0.9661002398928973, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350892782211303, "data/tokens_consumed": 19337838592, "data/tokens_consumed_B": 19.337838592, "train/loss_slope": -8.310177746099501e-06} {"step": 9225, "timestamp": 1778335684.0973766, "eos/sharpness": 42.103815078735344, "eos/L0_probe": 2.318350315093994, "eos/L_plus": 2.5697457790374756, "eos/L_minus": 2.487993001937866, "eos/grad_norm": 0.13276377320289612, "eos/embed_grad_frac": 0.12322145700454712, "eos/time_s": 0.5956354141235352} {"step": 9225, "timestamp": 1778335685.479204, "geo/rankme_last": 429.7840881347656, "geo/layer_0/stable_rank_q_proj": 20.763011932373047, "geo/layer_0/stable_rank_k_proj": 17.235097885131836, "geo/layer_0/stable_rank_o_proj": 44.74837875366211, "geo/layer_0/stable_rank_gate_proj": 126.39437103271484, "geo/layer_0/stable_rank_down_proj": 57.01087188720703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06455990672111511, "geo/layer_0/attn_entropy_mean": 6.233559608459473, "geo/layer_0/attn_entropy_std": 0.4334290027618408, "geo/layer_7/stable_rank_q_proj": 42.21360778808594, "geo/layer_7/stable_rank_k_proj": 38.59952926635742, "geo/layer_7/stable_rank_o_proj": 89.0659408569336, "geo/layer_7/stable_rank_gate_proj": 79.01912689208984, "geo/layer_7/stable_rank_down_proj": 144.07974243164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3988296687602997, "geo/layer_7/attn_entropy_mean": 4.706450462341309, "geo/layer_7/attn_entropy_std": 0.7573585510253906, "geo/layer_14/stable_rank_q_proj": 51.560340881347656, "geo/layer_14/stable_rank_k_proj": 42.602840423583984, "geo/layer_14/stable_rank_o_proj": 42.404075622558594, "geo/layer_14/stable_rank_gate_proj": 71.864990234375, "geo/layer_14/stable_rank_down_proj": 126.20913696289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3684633672237396, "geo/layer_14/attn_entropy_mean": 5.515512943267822, "geo/layer_14/attn_entropy_std": 0.4685426354408264, "geo/layer_21/stable_rank_q_proj": 39.25619888305664, "geo/layer_21/stable_rank_k_proj": 28.82179832458496, "geo/layer_21/stable_rank_o_proj": 65.415771484375, "geo/layer_21/stable_rank_gate_proj": 61.13193893432617, "geo/layer_21/stable_rank_down_proj": 49.61720657348633, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13413871824741364, "geo/layer_21/attn_entropy_mean": 5.831220626831055, "geo/layer_21/attn_entropy_std": 0.3281888961791992, "geo/layer_27/stable_rank_q_proj": 44.15823745727539, "geo/layer_27/stable_rank_k_proj": 30.184541702270508, "geo/layer_27/stable_rank_o_proj": 107.77587890625, "geo/layer_27/stable_rank_gate_proj": 71.11795043945312, "geo/layer_27/stable_rank_down_proj": 128.8700714111328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08680409938097, "geo/layer_27/attn_entropy_mean": 4.287549018859863, "geo/layer_27/attn_entropy_std": 0.7008694410324097, "attnres/final_alpha/block_0": 0.2549847960472107, "attnres/block_norm/0": 1.778954267501831, "attnres/final_alpha/block_1": 0.0038742912001907825, "attnres/block_norm/1": 50365.4296875, "attnres/final_alpha/block_2": 0.00858709029853344, "attnres/block_norm/2": 29694.4921875, "attnres/final_alpha/block_3": 0.010409442707896233, "attnres/block_norm/3": 71730.6484375, "attnres/final_alpha/block_4": 0.011892544105648994, "attnres/block_norm/4": 17203.845703125, "attnres/final_alpha/block_5": 0.6106785535812378, "attnres/block_norm/5": 7128.9697265625, "attnres/final_alpha/block_6": 0.09957325458526611, "attnres/block_norm/6": 47597.140625, "geo/tier1_time_s": 1.3620054721832275, "geo/step": 9225.0, "geo/rankme_slope": 0.0002512306094312725} {"step": 9230, "timestamp": 1778335691.2093534, "train/loss": 2.3364519357681273, "train/z_loss": 0.0013584189815446734, "train/perplexity": 10.344468534835316, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1630226.1455610122, "perf/iters_per_sec": 0.7773524024777471, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.286417841911316, "data/tokens_consumed": 19358810112, "data/tokens_consumed_B": 19.358810112, "train/loss_slope": -7.760496866298885e-06} {"step": 9240, "timestamp": 1778335701.5674322, "train/loss": 2.378354811668396, "train/z_loss": 0.0013568530441261828, "train/perplexity": 10.787141378316015, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025648.5529577024, "perf/iters_per_sec": 0.9659044995106232, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035299038887024, "data/tokens_consumed": 19379781632, "data/tokens_consumed_B": 19.379781632, "train/loss_slope": -6.3472168816842134e-06} {"step": 9250, "timestamp": 1778335711.9059365, "grad/layer_0/attn": 0.0029554618522524834, "grad/layer_0/mlp": 0.0033547228667885065, "grad/layer_0/attn_mlp_ratio": 0.8809853694362614, "grad/layer_4/attn": 0.0027899600099772215, "grad/layer_4/mlp": 0.00253321067430079, "grad/layer_4/attn_mlp_ratio": 1.1013532858304296, "grad/layer_8/attn": 0.0032398856710642576, "grad/layer_8/mlp": 0.00338775385171175, "grad/layer_8/attn_mlp_ratio": 0.9563521192048855, "grad/layer_12/attn": 0.007095897104591131, "grad/layer_12/mlp": 0.006695196498185396, "grad/layer_12/attn_mlp_ratio": 1.0598489529813564, "grad/layer_16/attn": 0.0034287888556718826, "grad/layer_16/mlp": 0.004729179665446281, "grad/layer_16/attn_mlp_ratio": 0.7250282344359807, "grad/layer_20/attn": 0.0029027878772467375, "grad/layer_20/mlp": 0.005735729821026325, "grad/layer_20/attn_mlp_ratio": 0.5060886612888715, "grad/layer_24/attn": 0.011971518397331238, "grad/layer_24/mlp": 0.008457518182694912, "grad/layer_24/attn_mlp_ratio": 1.4154883261472089, "grad/layer_27/attn": 0.0067383162677288055, "grad/layer_27/mlp": 0.007823624648153782, "grad/layer_27/attn_mlp_ratio": 0.8612780500903897} {"step": 9250, "timestamp": 1778335711.9216733, "train/loss": 2.353239130973816, "train/z_loss": 0.0013462014612741768, "train/perplexity": 10.519588924979638, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026778.544966738, "perf/iters_per_sec": 0.9664433216890039, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347218275070191, "data/tokens_consumed": 19400753152, "data/tokens_consumed_B": 19.400753152, "train/loss_slope": -4.859653107225085e-06} {"step": 9260, "timestamp": 1778335722.2744513, "train/loss": 2.3603820323944094, "train/z_loss": 0.0013446472818031906, "train/perplexity": 10.594998311942721, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027233.8383068102, "perf/iters_per_sec": 0.9666604224714328, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344894409179688, "data/tokens_consumed": 19421724672, "data/tokens_consumed_B": 19.421724672, "train/loss_slope": -3.6712290394459934e-06} {"step": 9270, "timestamp": 1778335732.6310732, "train/loss": 2.3617564916610716, "train/z_loss": 0.0013548416434787213, "train/perplexity": 10.609570717847902, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025901.2331559875, "perf/iters_per_sec": 0.9660249868183076, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351699113845825, "data/tokens_consumed": 19442696192, "data/tokens_consumed_B": 19.442696192, "train/loss_slope": -3.0255488508140436e-06} {"step": 9280, "timestamp": 1778335742.9780889, "train/loss": 2.2814971446990966, "train/z_loss": 0.0013528121053241193, "train/perplexity": 9.791328477181144, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028057.5947067058, "perf/iters_per_sec": 0.9670532201322106, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340692520141601, "data/tokens_consumed": 19463667712, "data/tokens_consumed_B": 19.463667712, "train/loss_slope": -9.32682977579681e-06} {"step": 9290, "timestamp": 1778335753.3376892, "train/loss": 2.394421362876892, "train/z_loss": 0.0013438466819934547, "train/perplexity": 10.961853288346806, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025610.954904035, "perf/iters_per_sec": 0.9658865713615584, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353182554244995, "data/tokens_consumed": 19484639232, "data/tokens_consumed_B": 19.484639232, "train/loss_slope": -8.43564916794416e-06} {"step": 9300, "timestamp": 1778335763.6866827, "grad/layer_0/attn": 0.0029766964726150036, "grad/layer_0/mlp": 0.0034957476891577244, "grad/layer_0/attn_mlp_ratio": 0.8515192319787513, "grad/layer_4/attn": 0.001914985477924347, "grad/layer_4/mlp": 0.002666689455509186, "grad/layer_4/attn_mlp_ratio": 0.718113390427516, "grad/layer_8/attn": 0.004501234740018845, "grad/layer_8/mlp": 0.0036984949838370085, "grad/layer_8/attn_mlp_ratio": 1.217044943412236, "grad/layer_12/attn": 0.006555067840963602, "grad/layer_12/mlp": 0.0067060478031635284, "grad/layer_12/attn_mlp_ratio": 0.977485985131615, "grad/layer_16/attn": 0.003602029522880912, "grad/layer_16/mlp": 0.004662222694605589, "grad/layer_16/attn_mlp_ratio": 0.7725991831725909, "grad/layer_20/attn": 0.0035802070051431656, "grad/layer_20/mlp": 0.006508656311780214, "grad/layer_20/attn_mlp_ratio": 0.5500685208491327, "grad/layer_24/attn": 0.005458996165543795, "grad/layer_24/mlp": 0.007589008193463087, "grad/layer_24/attn_mlp_ratio": 0.7193293187261356, "grad/layer_27/attn": 0.005443756002932787, "grad/layer_27/mlp": 0.0072990357875823975, "grad/layer_27/attn_mlp_ratio": 0.7458185007959839} {"step": 9300, "timestamp": 1778335764.2820916, "eos/sharpness": 4.6007394790649405, "eos/L0_probe": 2.320168972015381, "eos/L_plus": 2.3480429649353027, "eos/L_minus": 2.3383023738861084, "eos/grad_norm": 0.10198155790567398, "eos/embed_grad_frac": 0.25275784730911255, "eos/time_s": 0.592491626739502} {"step": 9300, "timestamp": 1778335764.3026817, "train/loss": 2.302180457115173, "train/z_loss": 0.0013717235415242612, "train/perplexity": 9.99595445975184, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913906.7073056393, "perf/iters_per_sec": 0.9126218353775212, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0957441091537476, "data/tokens_consumed": 19505610752, "data/tokens_consumed_B": 19.505610752, "train/loss_slope": -8.59561099542574e-06} {"step": 9300, "timestamp": 1778335765.664595, "geo/rankme_last": 429.1210021972656, "geo/layer_0/stable_rank_q_proj": 20.75737190246582, "geo/layer_0/stable_rank_k_proj": 17.22046661376953, "geo/layer_0/stable_rank_o_proj": 44.7485237121582, "geo/layer_0/stable_rank_gate_proj": 126.48247528076172, "geo/layer_0/stable_rank_down_proj": 56.97039794921875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06496866792440414, "geo/layer_0/attn_entropy_mean": 6.23630952835083, "geo/layer_0/attn_entropy_std": 0.44157618284225464, "geo/layer_7/stable_rank_q_proj": 42.22869873046875, "geo/layer_7/stable_rank_k_proj": 38.705013275146484, "geo/layer_7/stable_rank_o_proj": 89.0632095336914, "geo/layer_7/stable_rank_gate_proj": 78.90559387207031, "geo/layer_7/stable_rank_down_proj": 143.78955078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4021088182926178, "geo/layer_7/attn_entropy_mean": 4.727684020996094, "geo/layer_7/attn_entropy_std": 0.7875814437866211, "geo/layer_14/stable_rank_q_proj": 51.69157791137695, "geo/layer_14/stable_rank_k_proj": 42.482872009277344, "geo/layer_14/stable_rank_o_proj": 42.38786697387695, "geo/layer_14/stable_rank_gate_proj": 72.00949096679688, "geo/layer_14/stable_rank_down_proj": 126.02253723144531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38177961111068726, "geo/layer_14/attn_entropy_mean": 5.512162685394287, "geo/layer_14/attn_entropy_std": 0.452282577753067, "geo/layer_21/stable_rank_q_proj": 39.22690963745117, "geo/layer_21/stable_rank_k_proj": 28.810916900634766, "geo/layer_21/stable_rank_o_proj": 65.40396118164062, "geo/layer_21/stable_rank_gate_proj": 61.078216552734375, "geo/layer_21/stable_rank_down_proj": 49.58182144165039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13134783506393433, "geo/layer_21/attn_entropy_mean": 5.84883975982666, "geo/layer_21/attn_entropy_std": 0.32537323236465454, "geo/layer_27/stable_rank_q_proj": 44.21530532836914, "geo/layer_27/stable_rank_k_proj": 30.17457389831543, "geo/layer_27/stable_rank_o_proj": 107.73795318603516, "geo/layer_27/stable_rank_gate_proj": 71.05293273925781, "geo/layer_27/stable_rank_down_proj": 129.13986206054688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09628626704216003, "geo/layer_27/attn_entropy_mean": 4.335066318511963, "geo/layer_27/attn_entropy_std": 0.7041152715682983, "attnres/final_alpha/block_0": 0.2561602294445038, "attnres/block_norm/0": 1.7791156768798828, "attnres/final_alpha/block_1": 0.003920787945389748, "attnres/block_norm/1": 50405.9140625, "attnres/final_alpha/block_2": 0.008493704721331596, "attnres/block_norm/2": 29919.73828125, "attnres/final_alpha/block_3": 0.010661719366908073, "attnres/block_norm/3": 71175.375, "attnres/final_alpha/block_4": 0.011973539367318153, "attnres/block_norm/4": 17176.9375, "attnres/final_alpha/block_5": 0.6086980700492859, "attnres/block_norm/5": 7112.462890625, "attnres/final_alpha/block_6": 0.10009193420410156, "attnres/block_norm/6": 47589.18359375, "geo/tier1_time_s": 1.3579037189483643, "geo/step": 9300.0, "geo/rankme_slope": 0.00025231223348714485} {"step": 9310, "timestamp": 1778335776.0170157, "train/loss": 2.3227106094360352, "train/z_loss": 0.001354590035043657, "train/perplexity": 10.203294001057236, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790810.6168798893, "perf/iters_per_sec": 0.8539250454329916, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171062970161438, "data/tokens_consumed": 19526582272, "data/tokens_consumed_B": 19.526582272, "train/loss_slope": -8.274765870179862e-06} {"step": 9320, "timestamp": 1778335786.3687568, "train/loss": 2.315945625305176, "train/z_loss": 0.0013601624174043535, "train/perplexity": 10.134501830385052, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027199.965699291, "perf/iters_per_sec": 0.9666442707535224, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345067262649537, "data/tokens_consumed": 19547553792, "data/tokens_consumed_B": 19.547553792, "train/loss_slope": -8.146566733776058e-06} {"step": 9330, "timestamp": 1778335796.7454202, "train/loss": 2.321803069114685, "train/z_loss": 0.0013571737217716873, "train/perplexity": 10.194038300936489, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022308.852268636, "perf/iters_per_sec": 0.9643120061247997, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370087623596191, "data/tokens_consumed": 19568525312, "data/tokens_consumed_B": 19.568525312, "train/loss_slope": -7.773422372258918e-06} {"step": 9340, "timestamp": 1778335807.0985327, "train/loss": 2.389759063720703, "train/z_loss": 0.0013551618438214063, "train/perplexity": 10.910864803157517, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026654.9359258022, "perf/iters_per_sec": 0.9663843803051959, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347849369049071, "data/tokens_consumed": 19589496832, "data/tokens_consumed_B": 19.589496832, "train/loss_slope": -6.178403995146549e-06} {"step": 9350, "timestamp": 1778335817.4352634, "grad/layer_0/attn": 0.004041958600282669, "grad/layer_0/mlp": 0.0041019367054104805, "grad/layer_0/attn_mlp_ratio": 0.9853780767542056, "grad/layer_4/attn": 0.0022436215076595545, "grad/layer_4/mlp": 0.002748881233856082, "grad/layer_4/attn_mlp_ratio": 0.8161943842487526, "grad/layer_8/attn": 0.0038940596859902143, "grad/layer_8/mlp": 0.0035192822106182575, "grad/layer_8/attn_mlp_ratio": 1.106492557940348, "grad/layer_12/attn": 0.008600522764027119, "grad/layer_12/mlp": 0.007436472922563553, "grad/layer_12/attn_mlp_ratio": 1.1565325037731782, "grad/layer_16/attn": 0.003408904653042555, "grad/layer_16/mlp": 0.00435474282130599, "grad/layer_16/attn_mlp_ratio": 0.7828027313309743, "grad/layer_20/attn": 0.003986608237028122, "grad/layer_20/mlp": 0.00612091226503253, "grad/layer_20/attn_mlp_ratio": 0.6513094779469099, "grad/layer_24/attn": 0.007848798297345638, "grad/layer_24/mlp": 0.010579797439277172, "grad/layer_24/attn_mlp_ratio": 0.7418665875417008, "grad/layer_27/attn": 0.008010615594685078, "grad/layer_27/mlp": 0.01059505995362997, "grad/layer_27/attn_mlp_ratio": 0.7560708060300766} {"step": 9350, "timestamp": 1778335817.4510581, "train/loss": 2.3322638273239136, "train/z_loss": 0.001359304296784103, "train/perplexity": 10.301235374588778, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027007.2170713497, "perf/iters_per_sec": 0.9665523610455273, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346050977706909, "data/tokens_consumed": 19610468352, "data/tokens_consumed_B": 19.610468352, "train/loss_slope": -7.360215177058202e-06} {"step": 9360, "timestamp": 1778335827.7986183, "train/loss": 2.3576951742172243, "train/z_loss": 0.0013589135953225196, "train/perplexity": 10.566569263604444, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027824.057417706, "perf/iters_per_sec": 0.9669418608749895, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341883420944213, "data/tokens_consumed": 19631439872, "data/tokens_consumed_B": 19.631439872, "train/loss_slope": -2.8580137104592053e-06} {"step": 9370, "timestamp": 1778335838.1528702, "train/loss": 2.3559266090393067, "train/z_loss": 0.0013561444939114153, "train/perplexity": 10.547898112595204, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026327.1437882523, "perf/iters_per_sec": 0.9662280768338453, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349523305892945, "data/tokens_consumed": 19652411392, "data/tokens_consumed_B": 19.652411392, "train/loss_slope": -9.385468900436273e-07} {"step": 9375, "timestamp": 1778335843.9157126, "eos/sharpness": 26.661396026611325, "eos/L0_probe": 2.324810028076172, "eos/L_plus": 2.455812454223633, "eos/L_minus": 2.460421562194824, "eos/grad_norm": 0.10986895114183426, "eos/embed_grad_frac": 0.18453428149223328, "eos/time_s": 0.5961513519287109} {"step": 9375, "timestamp": 1778335845.3018193, "geo/rankme_last": 428.4307556152344, "geo/layer_0/stable_rank_q_proj": 20.76374053955078, "geo/layer_0/stable_rank_k_proj": 17.245084762573242, "geo/layer_0/stable_rank_o_proj": 44.721981048583984, "geo/layer_0/stable_rank_gate_proj": 126.39390563964844, "geo/layer_0/stable_rank_down_proj": 57.049503326416016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06335949152708054, "geo/layer_0/attn_entropy_mean": 6.235758304595947, "geo/layer_0/attn_entropy_std": 0.4396779239177704, "geo/layer_7/stable_rank_q_proj": 42.26149368286133, "geo/layer_7/stable_rank_k_proj": 38.67692184448242, "geo/layer_7/stable_rank_o_proj": 89.04895782470703, "geo/layer_7/stable_rank_gate_proj": 78.79308319091797, "geo/layer_7/stable_rank_down_proj": 143.5935821533203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38915741443634033, "geo/layer_7/attn_entropy_mean": 4.736352920532227, "geo/layer_7/attn_entropy_std": 0.7617122530937195, "geo/layer_14/stable_rank_q_proj": 51.72764587402344, "geo/layer_14/stable_rank_k_proj": 42.546104431152344, "geo/layer_14/stable_rank_o_proj": 42.40211486816406, "geo/layer_14/stable_rank_gate_proj": 72.0443115234375, "geo/layer_14/stable_rank_down_proj": 126.14820861816406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3809068500995636, "geo/layer_14/attn_entropy_mean": 5.502444267272949, "geo/layer_14/attn_entropy_std": 0.4496839940547943, "geo/layer_21/stable_rank_q_proj": 39.2623176574707, "geo/layer_21/stable_rank_k_proj": 28.82207489013672, "geo/layer_21/stable_rank_o_proj": 65.298828125, "geo/layer_21/stable_rank_gate_proj": 61.04827880859375, "geo/layer_21/stable_rank_down_proj": 49.58931350708008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13560093939304352, "geo/layer_21/attn_entropy_mean": 5.855371475219727, "geo/layer_21/attn_entropy_std": 0.32403072714805603, "geo/layer_27/stable_rank_q_proj": 44.344993591308594, "geo/layer_27/stable_rank_k_proj": 30.151277542114258, "geo/layer_27/stable_rank_o_proj": 107.9114990234375, "geo/layer_27/stable_rank_gate_proj": 71.0531005859375, "geo/layer_27/stable_rank_down_proj": 129.25787353515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10649939626455307, "geo/layer_27/attn_entropy_mean": 4.324430465698242, "geo/layer_27/attn_entropy_std": 0.717848539352417, "attnres/final_alpha/block_0": 0.25665515661239624, "attnres/block_norm/0": 1.7791662216186523, "attnres/final_alpha/block_1": 0.0039043943397700787, "attnres/block_norm/1": 50420.0859375, "attnres/final_alpha/block_2": 0.008534375578165054, "attnres/block_norm/2": 29908.2734375, "attnres/final_alpha/block_3": 0.010644602589309216, "attnres/block_norm/3": 71028.53125, "attnres/final_alpha/block_4": 0.01189972274005413, "attnres/block_norm/4": 17237.212890625, "attnres/final_alpha/block_5": 0.6078823804855347, "attnres/block_norm/5": 7150.6689453125, "attnres/final_alpha/block_6": 0.10047933459281921, "attnres/block_norm/6": 47646.0234375, "geo/tier1_time_s": 1.358506679534912, "geo/step": 9375.0, "geo/rankme_slope": 0.0002277156174969988} {"step": 9380, "timestamp": 1778335850.4823205, "train/loss": 2.3255543231964113, "train/z_loss": 0.001357039320282638, "train/perplexity": 10.23235054327269, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702125.502332251, "perf/iters_per_sec": 0.8116366874371772, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2320783615112305, "data/tokens_consumed": 19673382912, "data/tokens_consumed_B": 19.673382912, "train/loss_slope": -9.103476160680679e-07} {"step": 9390, "timestamp": 1778335860.8407395, "train/loss": 2.3920571327209474, "train/z_loss": 0.0013514750986360013, "train/perplexity": 10.93596755620926, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025560.8107886915, "perf/iters_per_sec": 0.9658626607840974, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035343885421753, "data/tokens_consumed": 19694354432, "data/tokens_consumed_B": 19.694354432, "train/loss_slope": 2.645825867605688e-06} {"step": 9400, "timestamp": 1778335871.188689, "grad/layer_0/attn": 0.002810191363096237, "grad/layer_0/mlp": 0.0030574023257941008, "grad/layer_0/attn_mlp_ratio": 0.9191434334544117, "grad/layer_4/attn": 0.002685325453057885, "grad/layer_4/mlp": 0.00257881754077971, "grad/layer_4/attn_mlp_ratio": 1.04130102516515, "grad/layer_8/attn": 0.0034928428940474987, "grad/layer_8/mlp": 0.0034715384244918823, "grad/layer_8/attn_mlp_ratio": 1.0061368668114476, "grad/layer_12/attn": 0.007861150428652763, "grad/layer_12/mlp": 0.006699938792735338, "grad/layer_12/attn_mlp_ratio": 1.1733167353476184, "grad/layer_16/attn": 0.003601278644055128, "grad/layer_16/mlp": 0.004951082170009613, "grad/layer_16/attn_mlp_ratio": 0.7273720062923003, "grad/layer_20/attn": 0.0037004053592681885, "grad/layer_20/mlp": 0.006195827387273312, "grad/layer_20/attn_mlp_ratio": 0.5972415092042339, "grad/layer_24/attn": 0.01093100942671299, "grad/layer_24/mlp": 0.01007628720253706, "grad/layer_24/attn_mlp_ratio": 1.08482510457604, "grad/layer_27/attn": 0.00940537080168724, "grad/layer_27/mlp": 0.008300157263875008, "grad/layer_27/attn_mlp_ratio": 1.1331557209532537} {"step": 9400, "timestamp": 1778335871.2047002, "train/loss": 2.3276852130889893, "train/z_loss": 0.0013697913964278996, "train/perplexity": 10.25417780310711, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024879.1904399458, "perf/iters_per_sec": 0.9655376388740281, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356924057006835, "data/tokens_consumed": 19715325952, "data/tokens_consumed_B": 19.715325952, "train/loss_slope": 1.2035286465887243e-06} {"step": 9410, "timestamp": 1778335881.554503, "train/loss": 2.3266390323638917, "train/z_loss": 0.001349052065052092, "train/perplexity": 10.243455689549982, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027308.1751277961, "perf/iters_per_sec": 0.9666958690299016, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344515085220336, "data/tokens_consumed": 19736297472, "data/tokens_consumed_B": 19.736297472, "train/loss_slope": -5.623585677812005e-07} {"step": 9420, "timestamp": 1778335891.9042761, "train/loss": 2.3365843296051025, "train/z_loss": 0.001351998990867287, "train/perplexity": 10.345838169379698, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027321.4918531394, "perf/iters_per_sec": 0.9667022189393708, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344447135925292, "data/tokens_consumed": 19757268992, "data/tokens_consumed_B": 19.757268992, "train/loss_slope": -3.1727270360397707e-06} {"step": 9430, "timestamp": 1778335902.670308, "train/loss": 2.3787401914596558, "train/z_loss": 0.0013603702653199434, "train/perplexity": 10.791299325751678, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949093.5696630827, "perf/iters_per_sec": 0.929400238830129, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0759627103805542, "data/tokens_consumed": 19778240512, "data/tokens_consumed_B": 19.778240512, "train/loss_slope": 7.442046456461694e-07} {"step": 9440, "timestamp": 1778335913.021827, "train/loss": 2.3303415298461916, "train/z_loss": 0.0013609082088805735, "train/perplexity": 10.281452356325737, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027062.3844684602, "perf/iters_per_sec": 0.966578666910391, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034576940536499, "data/tokens_consumed": 19799212032, "data/tokens_consumed_B": 19.799212032, "train/loss_slope": -1.7497050093345117e-06} {"step": 9450, "timestamp": 1778335923.37001, "grad/layer_0/attn": 0.003081339178606868, "grad/layer_0/mlp": 0.0033053234219551086, "grad/layer_0/attn_mlp_ratio": 0.9322352738361437, "grad/layer_4/attn": 0.00358309643343091, "grad/layer_4/mlp": 0.0025965250097215176, "grad/layer_4/attn_mlp_ratio": 1.379958321995663, "grad/layer_8/attn": 0.004778688307851553, "grad/layer_8/mlp": 0.003513444447889924, "grad/layer_8/attn_mlp_ratio": 1.3601149079530794, "grad/layer_12/attn": 0.006291667930781841, "grad/layer_12/mlp": 0.0068662152625620365, "grad/layer_12/attn_mlp_ratio": 0.9163225443068814, "grad/layer_16/attn": 0.005806641187518835, "grad/layer_16/mlp": 0.004785735160112381, "grad/layer_16/attn_mlp_ratio": 1.2133226916907396, "grad/layer_20/attn": 0.0036780056543648243, "grad/layer_20/mlp": 0.007328319828957319, "grad/layer_20/attn_mlp_ratio": 0.5018893402608495, "grad/layer_24/attn": 0.014929686672985554, "grad/layer_24/mlp": 0.014097564853727818, "grad/layer_24/attn_mlp_ratio": 1.0590259184468378, "grad/layer_27/attn": 0.005732521414756775, "grad/layer_27/mlp": 0.014734936878085136, "grad/layer_27/attn_mlp_ratio": 0.3890428186617015} {"step": 9450, "timestamp": 1778335923.972497, "eos/sharpness": 51.14755630493163, "eos/L0_probe": 2.317406415939331, "eos/L_plus": 2.5780811309814453, "eos/L_minus": 2.568207263946533, "eos/grad_norm": 0.2129475474357605, "eos/embed_grad_frac": 0.052745457738637924, "eos/time_s": 0.5996019840240479} {"step": 9450, "timestamp": 1778335923.9948924, "train/loss": 2.3428169012069704, "train/z_loss": 0.00135863299947232, "train/perplexity": 10.410520706441156, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912392.1925757702, "perf/iters_per_sec": 0.9118996584776736, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0966118812561034, "data/tokens_consumed": 19820183552, "data/tokens_consumed_B": 19.820183552, "train/loss_slope": -4.080876222502272e-06} {"step": 9450, "timestamp": 1778335925.3578255, "geo/rankme_last": 429.12469482421875, "geo/layer_0/stable_rank_q_proj": 20.779319763183594, "geo/layer_0/stable_rank_k_proj": 17.249958038330078, "geo/layer_0/stable_rank_o_proj": 44.63308334350586, "geo/layer_0/stable_rank_gate_proj": 126.55380249023438, "geo/layer_0/stable_rank_down_proj": 57.03630447387695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062362611293792725, "geo/layer_0/attn_entropy_mean": 6.236137390136719, "geo/layer_0/attn_entropy_std": 0.44664713740348816, "geo/layer_7/stable_rank_q_proj": 42.28550720214844, "geo/layer_7/stable_rank_k_proj": 38.69390106201172, "geo/layer_7/stable_rank_o_proj": 88.9617919921875, "geo/layer_7/stable_rank_gate_proj": 78.81951904296875, "geo/layer_7/stable_rank_down_proj": 143.68223571777344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40658289194107056, "geo/layer_7/attn_entropy_mean": 4.750844478607178, "geo/layer_7/attn_entropy_std": 0.7632162570953369, "geo/layer_14/stable_rank_q_proj": 51.66546630859375, "geo/layer_14/stable_rank_k_proj": 42.62177276611328, "geo/layer_14/stable_rank_o_proj": 42.43891143798828, "geo/layer_14/stable_rank_gate_proj": 72.05937957763672, "geo/layer_14/stable_rank_down_proj": 126.05843353271484, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3793623447418213, "geo/layer_14/attn_entropy_mean": 5.514481544494629, "geo/layer_14/attn_entropy_std": 0.45251044631004333, "geo/layer_21/stable_rank_q_proj": 39.21736526489258, "geo/layer_21/stable_rank_k_proj": 28.81414031982422, "geo/layer_21/stable_rank_o_proj": 65.29340362548828, "geo/layer_21/stable_rank_gate_proj": 60.984962463378906, "geo/layer_21/stable_rank_down_proj": 49.58575439453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13736183941364288, "geo/layer_21/attn_entropy_mean": 5.881074905395508, "geo/layer_21/attn_entropy_std": 0.3179108202457428, "geo/layer_27/stable_rank_q_proj": 44.37190246582031, "geo/layer_27/stable_rank_k_proj": 30.207178115844727, "geo/layer_27/stable_rank_o_proj": 107.81010437011719, "geo/layer_27/stable_rank_gate_proj": 71.12332153320312, "geo/layer_27/stable_rank_down_proj": 129.08705139160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09797278046607971, "geo/layer_27/attn_entropy_mean": 4.318483829498291, "geo/layer_27/attn_entropy_std": 0.6886031627655029, "attnres/final_alpha/block_0": 0.2557159662246704, "attnres/block_norm/0": 1.779465675354004, "attnres/final_alpha/block_1": 0.0038842475041747093, "attnres/block_norm/1": 50430.6171875, "attnres/final_alpha/block_2": 0.00863872654736042, "attnres/block_norm/2": 29995.0, "attnres/final_alpha/block_3": 0.010603077709674835, "attnres/block_norm/3": 71186.765625, "attnres/final_alpha/block_4": 0.011838535778224468, "attnres/block_norm/4": 17279.091796875, "attnres/final_alpha/block_5": 0.6084718704223633, "attnres/block_norm/5": 7172.73095703125, "attnres/final_alpha/block_6": 0.10084757953882217, "attnres/block_norm/6": 48002.5859375, "geo/tier1_time_s": 1.3587000370025635, "geo/step": 9450.0, "geo/rankme_slope": 0.00023073487207382954} {"step": 9460, "timestamp": 1778335935.709138, "train/loss": 2.325848340988159, "train/z_loss": 0.0013632687274366616, "train/perplexity": 10.235359478702412, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790871.2873754662, "perf/iters_per_sec": 0.8539539753796893, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1710232973098755, "data/tokens_consumed": 19841155072, "data/tokens_consumed_B": 19.841155072, "train/loss_slope": -5.135636869007597e-06} {"step": 9470, "timestamp": 1778335946.0553532, "train/loss": 2.356829214096069, "train/z_loss": 0.0013534598401747644, "train/perplexity": 10.557422996727482, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028353.8600238026, "perf/iters_per_sec": 0.9671944904440892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339182138442993, "data/tokens_consumed": 19862126592, "data/tokens_consumed_B": 19.862126592, "train/loss_slope": -5.16568949871364e-06} {"step": 9480, "timestamp": 1778335956.402349, "train/loss": 2.3377386331558228, "train/z_loss": 0.0013582330429926515, "train/perplexity": 10.357787302250271, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027698.310868175, "perf/iters_per_sec": 0.9668819002476573, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342524766921997, "data/tokens_consumed": 19883098112, "data/tokens_consumed_B": 19.883098112, "train/loss_slope": -3.343296966644265e-06} {"step": 9490, "timestamp": 1778335966.7522614, "train/loss": 2.288959097862244, "train/z_loss": 0.0013733230414800345, "train/perplexity": 9.864664185210188, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027415.4147788538, "perf/iters_per_sec": 0.9667470048803586, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343967914581298, "data/tokens_consumed": 19904069632, "data/tokens_consumed_B": 19.904069632, "train/loss_slope": -8.810322441355114e-06} {"step": 9500, "timestamp": 1778335977.0893643, "grad/layer_0/attn": 0.002627793001011014, "grad/layer_0/mlp": 0.0028894594870507717, "grad/layer_0/attn_mlp_ratio": 0.9094409947062673, "grad/layer_4/attn": 0.0020130123011767864, "grad/layer_4/mlp": 0.0025521672796458006, "grad/layer_4/attn_mlp_ratio": 0.7887461916608933, "grad/layer_8/attn": 0.004070015624165535, "grad/layer_8/mlp": 0.0033892812207341194, "grad/layer_8/attn_mlp_ratio": 1.200849159161556, "grad/layer_12/attn": 0.009400607086718082, "grad/layer_12/mlp": 0.006754686590284109, "grad/layer_12/attn_mlp_ratio": 1.391716228709741, "grad/layer_16/attn": 0.0038405193481594324, "grad/layer_16/mlp": 0.00428448710590601, "grad/layer_16/attn_mlp_ratio": 0.8963778309024746, "grad/layer_20/attn": 0.004100135527551174, "grad/layer_20/mlp": 0.005632683169096708, "grad/layer_20/attn_mlp_ratio": 0.7279187079533228, "grad/layer_24/attn": 0.0056008691899478436, "grad/layer_24/mlp": 0.007797771133482456, "grad/layer_24/attn_mlp_ratio": 0.7182653892048736, "grad/layer_27/attn": 0.00435506971552968, "grad/layer_27/mlp": 0.006831280887126923, "grad/layer_27/attn_mlp_ratio": 0.637518749958684} {"step": 9500, "timestamp": 1778335977.1049812, "train/loss": 2.33564875125885, "train/z_loss": 0.0013556292280554772, "train/perplexity": 10.336163353694328, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026987.4117273716, "perf/iters_per_sec": 0.9665429171215876, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346152067184449, "data/tokens_consumed": 19925041152, "data/tokens_consumed_B": 19.925041152, "train/loss_slope": -9.908916325745085e-06} {"step": 9500, "timestamp": 1778335983.9926326, "geo/ww_alpha_mean": 7.982847610538033, "geo/ww_alpha_std": 6.06865014203166, "geo/ww_alpha_min": 1.3618704207125438, "geo/ww_alpha_max": 55.55587881355267, "geo/ww_alpha_healthy_frac": 0.14720812182741116, "geo/ww_alpha_by_type/q_proj": 3.955479076996788, "geo/ww_alpha_by_type/k_proj": 4.351842171200883, "geo/ww_alpha_by_type/v_proj": 10.291267608444604, "geo/ww_alpha_by_type/o_proj": 9.950961697564155, "geo/ww_alpha_by_type/gate_proj": 7.82904070299746, "geo/ww_alpha_by_type/up_proj": 11.445308426401514, "geo/ww_alpha_by_type/down_proj": 8.162627977337639, "geo/twonn_id/layer_0": 0.7634983062744141, "geo/twonn_id/layer_7": 3.5719735622406006, "geo/twonn_id/layer_14": 5.4420037269592285, "geo/twonn_id/layer_21": 7.476568698883057, "geo/twonn_id/layer_27": 6.086569309234619, "geo/tier2_time_s": 6.881454706192017} {"step": 9500, "timestamp": 1778335984.7484093, "eoc/jacobian_sigma/layer_0/attn": 1537.5242919921875, "eoc/jacobian_sigma/layer_0/mlp": 10272.017578125, "eoc/jacobian_sigma/layer_0": 10272.017578125, "eoc/jacobian_sigma/layer_7/attn": 1.1277507543563843, "eoc/jacobian_sigma/layer_7/mlp": 1.7933212518692017, "eoc/jacobian_sigma/layer_7": 1.7933212518692017, "eoc/jacobian_sigma/layer_14/attn": 2.1146490573883057, "eoc/jacobian_sigma/layer_14/mlp": 14.050119400024414, "eoc/jacobian_sigma/layer_14": 14.050119400024414, "eoc/jacobian_sigma/layer_21/attn": 1.098985195159912, "eoc/jacobian_sigma/layer_21/mlp": 5.869105339050293, "eoc/jacobian_sigma/layer_21": 5.869105339050293, "eoc/jacobian_sigma/layer_27/attn": 3.9236814975738525, "eoc/jacobian_sigma/layer_27/mlp": 28.63975715637207, "eoc/jacobian_sigma/layer_27": 28.63975715637207, "eoc/layer0_sigma": 10272.017578125, "eoc/sigma_max": 28.63975715637207, "eoc/sigma_min": 1.7933212518692017, "eoc/sigma_mean": 12.588075786828995, "eoc/time_s": 0.7489070892333984} {"step": 9510, "timestamp": 1778335995.1149774, "train/loss": 2.337609791755676, "train/z_loss": 0.0013549354742281138, "train/perplexity": 10.356452876398324, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164762.4501252628, "perf/iters_per_sec": 0.5554020166994394, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8004976034164428, "data/tokens_consumed": 19946012672, "data/tokens_consumed_B": 19.946012672, "train/loss_slope": -1.0124591389039995e-05} {"step": 9520, "timestamp": 1778336005.4737918, "train/loss": 2.354110264778137, "train/z_loss": 0.0013604982057586312, "train/perplexity": 10.528756887180867, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025519.9510011734, "perf/iters_per_sec": 0.9658431773191325, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353647708892821, "data/tokens_consumed": 19966984192, "data/tokens_consumed_B": 19.966984192, "train/loss_slope": -7.432459969439002e-06} {"step": 9525, "timestamp": 1778336011.2430503, "eos/sharpness": 43.786978721618645, "eos/L0_probe": 2.32086181640625, "eos/L_plus": 2.5070137977600098, "eos/L_minus": 2.5725796222686768, "eos/grad_norm": 0.1178736686706543, "eos/embed_grad_frac": 0.1600937843322754, "eos/time_s": 0.6064326763153076} {"step": 9525, "timestamp": 1778336012.6292915, "geo/rankme_last": 430.04833984375, "geo/layer_0/stable_rank_q_proj": 20.767553329467773, "geo/layer_0/stable_rank_k_proj": 17.19554901123047, "geo/layer_0/stable_rank_o_proj": 44.60969543457031, "geo/layer_0/stable_rank_gate_proj": 126.69815826416016, "geo/layer_0/stable_rank_down_proj": 57.09541320800781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06134966388344765, "geo/layer_0/attn_entropy_mean": 6.239316463470459, "geo/layer_0/attn_entropy_std": 0.4488779306411743, "geo/layer_7/stable_rank_q_proj": 42.31476593017578, "geo/layer_7/stable_rank_k_proj": 38.6483039855957, "geo/layer_7/stable_rank_o_proj": 88.89836883544922, "geo/layer_7/stable_rank_gate_proj": 78.67439270019531, "geo/layer_7/stable_rank_down_proj": 143.79396057128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4051516056060791, "geo/layer_7/attn_entropy_mean": 4.748109340667725, "geo/layer_7/attn_entropy_std": 0.7655837535858154, "geo/layer_14/stable_rank_q_proj": 51.669273376464844, "geo/layer_14/stable_rank_k_proj": 42.60515594482422, "geo/layer_14/stable_rank_o_proj": 42.399559020996094, "geo/layer_14/stable_rank_gate_proj": 71.9787368774414, "geo/layer_14/stable_rank_down_proj": 126.05023193359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3816782832145691, "geo/layer_14/attn_entropy_mean": 5.541191101074219, "geo/layer_14/attn_entropy_std": 0.43436330556869507, "geo/layer_21/stable_rank_q_proj": 39.06647872924805, "geo/layer_21/stable_rank_k_proj": 28.75514030456543, "geo/layer_21/stable_rank_o_proj": 65.47374725341797, "geo/layer_21/stable_rank_gate_proj": 60.932777404785156, "geo/layer_21/stable_rank_down_proj": 49.51934051513672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13669565320014954, "geo/layer_21/attn_entropy_mean": 5.855477809906006, "geo/layer_21/attn_entropy_std": 0.3257032632827759, "geo/layer_27/stable_rank_q_proj": 44.28071975708008, "geo/layer_27/stable_rank_k_proj": 30.20359230041504, "geo/layer_27/stable_rank_o_proj": 107.79744720458984, "geo/layer_27/stable_rank_gate_proj": 71.08688354492188, "geo/layer_27/stable_rank_down_proj": 128.96446228027344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08800753206014633, "geo/layer_27/attn_entropy_mean": 4.31423282623291, "geo/layer_27/attn_entropy_std": 0.6897561550140381, "attnres/final_alpha/block_0": 0.25812599062919617, "attnres/block_norm/0": 1.779578685760498, "attnres/final_alpha/block_1": 0.004009990021586418, "attnres/block_norm/1": 50302.265625, "attnres/final_alpha/block_2": 0.008710991591215134, "attnres/block_norm/2": 29776.08203125, "attnres/final_alpha/block_3": 0.010687224566936493, "attnres/block_norm/3": 71526.984375, "attnres/final_alpha/block_4": 0.012094974517822266, "attnres/block_norm/4": 17251.943359375, "attnres/final_alpha/block_5": 0.6045883893966675, "attnres/block_norm/5": 7181.5263671875, "attnres/final_alpha/block_6": 0.1017824336886406, "attnres/block_norm/6": 47631.828125, "geo/tier1_time_s": 1.358626365661621, "geo/step": 9525.0, "geo/rankme_slope": 0.00026819995576355543} {"step": 9530, "timestamp": 1778336017.806208, "train/loss": 2.3108306646347048, "train/z_loss": 0.001357399858534336, "train/perplexity": 10.082796599947404, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701329.2436503237, "perf/iters_per_sec": 0.8112570017100924, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2326550006866455, "data/tokens_consumed": 19987955712, "data/tokens_consumed_B": 19.987955712, "train/loss_slope": -5.34357184802951e-06} {"step": 9540, "timestamp": 1778336028.1562421, "train/loss": 2.3389012813568115, "train/z_loss": 0.0013575363089330494, "train/perplexity": 10.369836768311153, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027429.0132967373, "perf/iters_per_sec": 0.966753489158982, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034389853477478, "data/tokens_consumed": 20008927232, "data/tokens_consumed_B": 20.008927232, "train/loss_slope": -4.642384981486707e-06} {"step": 9550, "timestamp": 1778336038.5041199, "grad/layer_0/attn": 0.0028526620008051395, "grad/layer_0/mlp": 0.0030281459912657738, "grad/layer_0/attn_mlp_ratio": 0.9420490012133855, "grad/layer_4/attn": 0.0017955859657377005, "grad/layer_4/mlp": 0.0024650180712342262, "grad/layer_4/attn_mlp_ratio": 0.7284270707175585, "grad/layer_8/attn": 0.004869906697422266, "grad/layer_8/mlp": 0.0034387726336717606, "grad/layer_8/attn_mlp_ratio": 1.4161757913621722, "grad/layer_12/attn": 0.006206815596669912, "grad/layer_12/mlp": 0.006727166939526796, "grad/layer_12/attn_mlp_ratio": 0.9226492459902577, "grad/layer_16/attn": 0.003250791458413005, "grad/layer_16/mlp": 0.004609584808349609, "grad/layer_16/attn_mlp_ratio": 0.7052243364743445, "grad/layer_20/attn": 0.003304628888145089, "grad/layer_20/mlp": 0.005819772835820913, "grad/layer_20/attn_mlp_ratio": 0.5678278043813323, "grad/layer_24/attn": 0.010561949573457241, "grad/layer_24/mlp": 0.011526201851665974, "grad/layer_24/attn_mlp_ratio": 0.9163425747481927, "grad/layer_27/attn": 0.004405357409268618, "grad/layer_27/mlp": 0.012345671653747559, "grad/layer_27/attn_mlp_ratio": 0.35683415995013484} {"step": 9550, "timestamp": 1778336038.519931, "train/loss": 2.3441304922103883, "train/z_loss": 0.0013472369639202953, "train/perplexity": 10.424204858503861, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025289.5158755025, "perf/iters_per_sec": 0.9657332972886574, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354825735092164, "data/tokens_consumed": 20029898752, "data/tokens_consumed_B": 20.029898752, "train/loss_slope": -5.067426436590454e-06} {"step": 9560, "timestamp": 1778336048.900221, "train/loss": 2.357112264633179, "train/z_loss": 0.0013662322075106204, "train/perplexity": 10.560411703934836, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021346.9124787874, "perf/iters_per_sec": 0.9638533174890458, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375022649765016, "data/tokens_consumed": 20050870272, "data/tokens_consumed_B": 20.050870272, "train/loss_slope": -4.7802967457237085e-06} {"step": 9570, "timestamp": 1778336059.2860768, "train/loss": 2.3636143684387205, "train/z_loss": 0.0013552920543588697, "train/perplexity": 10.629300314810365, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020559.2305325049, "perf/iters_per_sec": 0.9634777214682125, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379067182540893, "data/tokens_consumed": 20071841792, "data/tokens_consumed_B": 20.071841792, "train/loss_slope": -3.4222737611895438e-06} {"step": 9580, "timestamp": 1778336070.3622577, "train/loss": 2.3389581203460694, "train/z_loss": 0.0013586079236119985, "train/perplexity": 10.370426196102915, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1894548.8835629376, "perf/iters_per_sec": 0.9033913057150543, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1069400310516357, "data/tokens_consumed": 20092813312, "data/tokens_consumed_B": 20.092813312, "train/loss_slope": -4.854997943336753e-06} {"step": 9590, "timestamp": 1778336080.751275, "train/loss": 2.374320459365845, "train/z_loss": 0.0013469977187924088, "train/perplexity": 10.743709917473007, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019830.925270261, "perf/iters_per_sec": 0.9631304384566598, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038280963897705, "data/tokens_consumed": 20113784832, "data/tokens_consumed_B": 20.113784832, "train/loss_slope": -2.612189076306557e-06} {"step": 9600, "timestamp": 1778336091.119774, "grad/layer_0/attn": 0.0031479743774980307, "grad/layer_0/mlp": 0.0033805686980485916, "grad/layer_0/attn_mlp_ratio": 0.9311966611403311, "grad/layer_4/attn": 0.001932372571900487, "grad/layer_4/mlp": 0.002624013228341937, "grad/layer_4/attn_mlp_ratio": 0.7364187334831548, "grad/layer_8/attn": 0.003541846526786685, "grad/layer_8/mlp": 0.003427020274102688, "grad/layer_8/attn_mlp_ratio": 1.0335061190624122, "grad/layer_12/attn": 0.007361260708421469, "grad/layer_12/mlp": 0.006162135396152735, "grad/layer_12/attn_mlp_ratio": 1.1945957230277386, "grad/layer_16/attn": 0.003333797212690115, "grad/layer_16/mlp": 0.00439427001401782, "grad/layer_16/attn_mlp_ratio": 0.7586691591978443, "grad/layer_20/attn": 0.004425331484526396, "grad/layer_20/mlp": 0.006099180318415165, "grad/layer_20/attn_mlp_ratio": 0.7255616625415865, "grad/layer_24/attn": 0.00899997353553772, "grad/layer_24/mlp": 0.010861216112971306, "grad/layer_24/attn_mlp_ratio": 0.8286340460462663, "grad/layer_27/attn": 0.0053519899956882, "grad/layer_27/mlp": 0.010998043231666088, "grad/layer_27/attn_mlp_ratio": 0.48663110648768737} {"step": 9600, "timestamp": 1778336091.7190645, "eos/sharpness": 26.720976829528805, "eos/L0_probe": 2.320380926132202, "eos/L_plus": 2.4634785652160645, "eos/L_minus": 2.444493055343628, "eos/grad_norm": 0.1500004678964615, "eos/embed_grad_frac": 0.11126319319009781, "eos/time_s": 0.5966336727142334} {"step": 9600, "timestamp": 1778336091.7401056, "train/loss": 2.3452300310134886, "train/z_loss": 0.001357367739547044, "train/perplexity": 10.435672979904092, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909569.7637214628, "perf/iters_per_sec": 0.9105538195235552, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0982327222824098, "data/tokens_consumed": 20134756352, "data/tokens_consumed_B": 20.134756352, "train/loss_slope": -3.815846281512258e-06} {"step": 9600, "timestamp": 1778336093.1011522, "geo/rankme_last": 428.4444580078125, "geo/layer_0/stable_rank_q_proj": 20.775590896606445, "geo/layer_0/stable_rank_k_proj": 17.192655563354492, "geo/layer_0/stable_rank_o_proj": 44.53427505493164, "geo/layer_0/stable_rank_gate_proj": 126.38418579101562, "geo/layer_0/stable_rank_down_proj": 57.137367248535156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06725922226905823, "geo/layer_0/attn_entropy_mean": 6.23790979385376, "geo/layer_0/attn_entropy_std": 0.44634389877319336, "geo/layer_7/stable_rank_q_proj": 42.324073791503906, "geo/layer_7/stable_rank_k_proj": 38.82618713378906, "geo/layer_7/stable_rank_o_proj": 88.7340316772461, "geo/layer_7/stable_rank_gate_proj": 78.59790802001953, "geo/layer_7/stable_rank_down_proj": 143.7067108154297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4011269807815552, "geo/layer_7/attn_entropy_mean": 4.741511821746826, "geo/layer_7/attn_entropy_std": 0.7588397860527039, "geo/layer_14/stable_rank_q_proj": 51.739227294921875, "geo/layer_14/stable_rank_k_proj": 42.598384857177734, "geo/layer_14/stable_rank_o_proj": 42.43512725830078, "geo/layer_14/stable_rank_gate_proj": 72.0265121459961, "geo/layer_14/stable_rank_down_proj": 126.11421203613281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3726758658885956, "geo/layer_14/attn_entropy_mean": 5.541146755218506, "geo/layer_14/attn_entropy_std": 0.44470950961112976, "geo/layer_21/stable_rank_q_proj": 39.08888626098633, "geo/layer_21/stable_rank_k_proj": 28.806039810180664, "geo/layer_21/stable_rank_o_proj": 65.49287414550781, "geo/layer_21/stable_rank_gate_proj": 60.90903854370117, "geo/layer_21/stable_rank_down_proj": 49.46331787109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13695411384105682, "geo/layer_21/attn_entropy_mean": 5.84136438369751, "geo/layer_21/attn_entropy_std": 0.3219030499458313, "geo/layer_27/stable_rank_q_proj": 44.19523620605469, "geo/layer_27/stable_rank_k_proj": 30.2202205657959, "geo/layer_27/stable_rank_o_proj": 107.73043823242188, "geo/layer_27/stable_rank_gate_proj": 71.0228271484375, "geo/layer_27/stable_rank_down_proj": 128.9468536376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10179770737886429, "geo/layer_27/attn_entropy_mean": 4.301872253417969, "geo/layer_27/attn_entropy_std": 0.7016304135322571, "attnres/final_alpha/block_0": 0.2573015093803406, "attnres/block_norm/0": 1.7796177864074707, "attnres/final_alpha/block_1": 0.003946706186980009, "attnres/block_norm/1": 50381.796875, "attnres/final_alpha/block_2": 0.008600687608122826, "attnres/block_norm/2": 29834.37890625, "attnres/final_alpha/block_3": 0.010578708723187447, "attnres/block_norm/3": 71335.328125, "attnres/final_alpha/block_4": 0.012127608060836792, "attnres/block_norm/4": 17172.046875, "attnres/final_alpha/block_5": 0.607109546661377, "attnres/block_norm/5": 7127.1083984375, "attnres/final_alpha/block_6": 0.1003352478146553, "attnres/block_norm/6": 47816.8125, "geo/tier1_time_s": 1.356595516204834, "geo/step": 9600.0, "geo/rankme_slope": 0.00028607904099139654} {"step": 9610, "timestamp": 1778336103.8567414, "train/loss": 2.34701247215271, "train/z_loss": 0.0013623308041132987, "train/perplexity": 10.454290540162447, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1731415.6360429602, "perf/iters_per_sec": 0.8256033115591813, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2112354516983033, "data/tokens_consumed": 20155727872, "data/tokens_consumed_B": 20.155727872, "train/loss_slope": -3.263133830434682e-06} {"step": 9620, "timestamp": 1778336114.234973, "train/loss": 2.36094012260437, "train/z_loss": 0.0013591114548034966, "train/perplexity": 10.600912927066048, "train/grad_norm": 0.328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021865.2024606732, "perf/iters_per_sec": 0.9641004574111334, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372363090515138, "data/tokens_consumed": 20176699392, "data/tokens_consumed_B": 20.176699392, "train/loss_slope": -2.047097400398552e-06} {"step": 9630, "timestamp": 1778336124.6129546, "train/loss": 2.30314621925354, "train/z_loss": 0.001360724971164018, "train/perplexity": 10.005612837202843, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022128.5611898229, "perf/iters_per_sec": 0.9642260366391291, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371012210845947, "data/tokens_consumed": 20197670912, "data/tokens_consumed_B": 20.197670912, "train/loss_slope": -6.847909918211915e-06} {"step": 9640, "timestamp": 1778336135.5008101, "train/loss": 2.3855390548706055, "train/z_loss": 0.0013536894111894071, "train/perplexity": 10.864917873537829, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927389.6332312291, "perf/iters_per_sec": 0.9190509954601427, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0880789041519165, "data/tokens_consumed": 20218642432, "data/tokens_consumed_B": 20.218642432, "train/loss_slope": -4.07136580338173e-06} {"step": 9650, "timestamp": 1778336145.8694181, "grad/layer_0/attn": 0.0027379256207495928, "grad/layer_0/mlp": 0.0031654818449169397, "grad/layer_0/attn_mlp_ratio": 0.8649316813024609, "grad/layer_4/attn": 0.0017866374691948295, "grad/layer_4/mlp": 0.002560298191383481, "grad/layer_4/attn_mlp_ratio": 0.6978239509074559, "grad/layer_8/attn": 0.005893734749406576, "grad/layer_8/mlp": 0.003412769641727209, "grad/layer_8/attn_mlp_ratio": 1.726965249763304, "grad/layer_12/attn": 0.005521849729120731, "grad/layer_12/mlp": 0.007171478122472763, "grad/layer_12/attn_mlp_ratio": 0.7699737150169814, "grad/layer_16/attn": 0.007843359373509884, "grad/layer_16/mlp": 0.004601760301738977, "grad/layer_16/attn_mlp_ratio": 1.7044258476703669, "grad/layer_20/attn": 0.0033009895123541355, "grad/layer_20/mlp": 0.005896744318306446, "grad/layer_20/attn_mlp_ratio": 0.5597986411122402, "grad/layer_24/attn": 0.005811924580484629, "grad/layer_24/mlp": 0.007682343944907188, "grad/layer_24/attn_mlp_ratio": 0.7565301093664888, "grad/layer_27/attn": 0.0060588098131120205, "grad/layer_27/mlp": 0.006680218502879143, "grad/layer_27/attn_mlp_ratio": 0.9069777762213803} {"step": 9650, "timestamp": 1778336145.8853788, "train/loss": 2.33044011592865, "train/z_loss": 0.0013500480097718536, "train/perplexity": 10.282466014400999, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020507.433316322, "perf/iters_per_sec": 0.9634530226308451, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379333257675172, "data/tokens_consumed": 20239613952, "data/tokens_consumed_B": 20.239613952, "train/loss_slope": -3.3663825376448696e-06} {"step": 9660, "timestamp": 1778336156.2712548, "train/loss": 2.3507880687713625, "train/z_loss": 0.0013590195099823176, "train/perplexity": 10.493836331677134, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020639.205822761, "perf/iters_per_sec": 0.9635158566583447, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378656387329102, "data/tokens_consumed": 20260585472, "data/tokens_consumed_B": 20.260585472, "train/loss_slope": -2.8895794719394197e-06} {"step": 9670, "timestamp": 1778336166.6637406, "train/loss": 2.3336706161499023, "train/z_loss": 0.0013502525980584323, "train/perplexity": 10.315737235543939, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019242.058146945, "perf/iters_per_sec": 0.9628496447310185, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038583755493164, "data/tokens_consumed": 20281556992, "data/tokens_consumed_B": 20.281556992, "train/loss_slope": -1.707115081777238e-07} {"step": 9675, "timestamp": 1778336172.449781, "eos/sharpness": 31.91204071044921, "eos/L0_probe": 2.321195602416992, "eos/L_plus": 2.5154502391815186, "eos/L_minus": 2.446061372756958, "eos/grad_norm": 0.12347063422203064, "eos/embed_grad_frac": 0.14687928557395935, "eos/time_s": 0.6084251403808594} {"step": 9675, "timestamp": 1778336173.8316035, "geo/rankme_last": 430.3345031738281, "geo/layer_0/stable_rank_q_proj": 20.746728897094727, "geo/layer_0/stable_rank_k_proj": 17.172487258911133, "geo/layer_0/stable_rank_o_proj": 44.6018180847168, "geo/layer_0/stable_rank_gate_proj": 126.51536560058594, "geo/layer_0/stable_rank_down_proj": 57.109981536865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06240889057517052, "geo/layer_0/attn_entropy_mean": 6.236477375030518, "geo/layer_0/attn_entropy_std": 0.44644564390182495, "geo/layer_7/stable_rank_q_proj": 42.299530029296875, "geo/layer_7/stable_rank_k_proj": 38.807952880859375, "geo/layer_7/stable_rank_o_proj": 88.7144546508789, "geo/layer_7/stable_rank_gate_proj": 78.69546508789062, "geo/layer_7/stable_rank_down_proj": 143.62283325195312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4045770764350891, "geo/layer_7/attn_entropy_mean": 4.754621505737305, "geo/layer_7/attn_entropy_std": 0.7664678692817688, "geo/layer_14/stable_rank_q_proj": 51.80717086791992, "geo/layer_14/stable_rank_k_proj": 42.540950775146484, "geo/layer_14/stable_rank_o_proj": 42.41674041748047, "geo/layer_14/stable_rank_gate_proj": 72.00484466552734, "geo/layer_14/stable_rank_down_proj": 126.25653076171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802492022514343, "geo/layer_14/attn_entropy_mean": 5.550825119018555, "geo/layer_14/attn_entropy_std": 0.4679912328720093, "geo/layer_21/stable_rank_q_proj": 39.164222717285156, "geo/layer_21/stable_rank_k_proj": 28.763940811157227, "geo/layer_21/stable_rank_o_proj": 65.49418640136719, "geo/layer_21/stable_rank_gate_proj": 60.889404296875, "geo/layer_21/stable_rank_down_proj": 49.524871826171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13457490503787994, "geo/layer_21/attn_entropy_mean": 5.855093002319336, "geo/layer_21/attn_entropy_std": 0.31643304228782654, "geo/layer_27/stable_rank_q_proj": 44.18022918701172, "geo/layer_27/stable_rank_k_proj": 30.115354537963867, "geo/layer_27/stable_rank_o_proj": 107.66570281982422, "geo/layer_27/stable_rank_gate_proj": 70.9777603149414, "geo/layer_27/stable_rank_down_proj": 129.2014923095703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09947963804006577, "geo/layer_27/attn_entropy_mean": 4.312058925628662, "geo/layer_27/attn_entropy_std": 0.70137619972229, "attnres/final_alpha/block_0": 0.2550460696220398, "attnres/block_norm/0": 1.7796099185943604, "attnres/final_alpha/block_1": 0.003894092980772257, "attnres/block_norm/1": 50553.28125, "attnres/final_alpha/block_2": 0.008560322225093842, "attnres/block_norm/2": 29881.83984375, "attnres/final_alpha/block_3": 0.010570723563432693, "attnres/block_norm/3": 71335.71875, "attnres/final_alpha/block_4": 0.01199941523373127, "attnres/block_norm/4": 17295.20703125, "attnres/final_alpha/block_5": 0.6102906465530396, "attnres/block_norm/5": 7141.0517578125, "attnres/final_alpha/block_6": 0.0996386930346489, "attnres/block_norm/6": 47654.8828125, "geo/tier1_time_s": 1.3598098754882812, "geo/step": 9675.0, "geo/rankme_slope": 0.00037065492212510007} {"step": 9680, "timestamp": 1778336179.022046, "train/loss": 2.2959821939468386, "train/z_loss": 0.0013637560768984258, "train/perplexity": 9.934188521906336, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697645.871459613, "perf/iters_per_sec": 0.8095006329820695, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2353294849395753, "data/tokens_consumed": 20302528512, "data/tokens_consumed_B": 20.302528512, "train/loss_slope": -1.8079038500391643e-06} {"step": 9690, "timestamp": 1778336189.4004033, "train/loss": 2.3142956256866456, "train/z_loss": 0.0013521106797270478, "train/perplexity": 10.11779369423072, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021858.0918895362, "perf/iters_per_sec": 0.9640970668265992, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037239956855774, "data/tokens_consumed": 20323500032, "data/tokens_consumed_B": 20.323500032, "train/loss_slope": -2.256603509929761e-06} {"step": 9700, "timestamp": 1778336199.7708848, "grad/layer_0/attn": 0.0031818016432225704, "grad/layer_0/mlp": 0.0033686708193272352, "grad/layer_0/attn_mlp_ratio": 0.9445272985756694, "grad/layer_4/attn": 0.0016182876424863935, "grad/layer_4/mlp": 0.002508604433387518, "grad/layer_4/attn_mlp_ratio": 0.6450947612301104, "grad/layer_8/attn": 0.005864548496901989, "grad/layer_8/mlp": 0.0032388302497565746, "grad/layer_8/attn_mlp_ratio": 1.8106995006215003, "grad/layer_12/attn": 0.004638328682631254, "grad/layer_12/mlp": 0.006675057578831911, "grad/layer_12/attn_mlp_ratio": 0.6948746970891988, "grad/layer_16/attn": 0.003490792354568839, "grad/layer_16/mlp": 0.004449609201401472, "grad/layer_16/attn_mlp_ratio": 0.7845165986751603, "grad/layer_20/attn": 0.003093911102041602, "grad/layer_20/mlp": 0.006388021167367697, "grad/layer_20/attn_mlp_ratio": 0.484330119225872, "grad/layer_24/attn": 0.013208916410803795, "grad/layer_24/mlp": 0.011679559014737606, "grad/layer_24/attn_mlp_ratio": 1.1309430673745555, "grad/layer_27/attn": 0.007223449181765318, "grad/layer_27/mlp": 0.00946978759020567, "grad/layer_27/attn_mlp_ratio": 0.7627889260111211} {"step": 9700, "timestamp": 1778336199.787432, "train/loss": 2.351699948310852, "train/z_loss": 0.0013628930901177227, "train/perplexity": 10.50340981058513, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019980.8398273177, "perf/iters_per_sec": 0.96320192328802, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0382039070129394, "data/tokens_consumed": 20344471552, "data/tokens_consumed_B": 20.344471552, "train/loss_slope": -3.4287844840162407e-06} {"step": 9710, "timestamp": 1778336210.1417997, "train/loss": 2.320791482925415, "train/z_loss": 0.0013662612298503518, "train/perplexity": 10.183731366633499, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026653.4883837418, "perf/iters_per_sec": 0.9663836900633535, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347856760025025, "data/tokens_consumed": 20365443072, "data/tokens_consumed_B": 20.365443072, "train/loss_slope": -3.35352262719549e-06} {"step": 9720, "timestamp": 1778336220.4966385, "train/loss": 2.3405232191085816, "train/z_loss": 0.001364420319441706, "train/perplexity": 10.386669645294534, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026599.930781147, "perf/iters_per_sec": 0.966358151808332, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348130226135255, "data/tokens_consumed": 20386414592, "data/tokens_consumed_B": 20.386414592, "train/loss_slope": -1.8910672977239352e-06} {"step": 9730, "timestamp": 1778336230.8622935, "train/loss": 2.3822965383529664, "train/z_loss": 0.0013484386610798538, "train/perplexity": 10.829745252589108, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024386.2366132662, "perf/iters_per_sec": 0.9653025801721888, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359446048736571, "data/tokens_consumed": 20407386112, "data/tokens_consumed_B": 20.407386112, "train/loss_slope": 1.2507946112834477e-06} {"step": 9740, "timestamp": 1778336241.2492418, "train/loss": 2.3602362394332888, "train/z_loss": 0.0013518761727027595, "train/perplexity": 10.59345374836174, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020284.5870586338, "perf/iters_per_sec": 0.9633467612546128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380478143692016, "data/tokens_consumed": 20428357632, "data/tokens_consumed_B": 20.428357632, "train/loss_slope": 8.565776240100716e-07} {"step": 9750, "timestamp": 1778336251.5999846, "grad/layer_0/attn": 0.0034079360775649548, "grad/layer_0/mlp": 0.0035162544809281826, "grad/layer_0/attn_mlp_ratio": 0.9691949200860663, "grad/layer_4/attn": 0.0020480721723288298, "grad/layer_4/mlp": 0.0026377656031399965, "grad/layer_4/attn_mlp_ratio": 0.77644203573153, "grad/layer_8/attn": 0.004520581569522619, "grad/layer_8/mlp": 0.0034871380776166916, "grad/layer_8/attn_mlp_ratio": 1.2963586010268862, "grad/layer_12/attn": 0.007102260831743479, "grad/layer_12/mlp": 0.0072052632458508015, "grad/layer_12/attn_mlp_ratio": 0.9857045455296736, "grad/layer_16/attn": 0.0036865011788904667, "grad/layer_16/mlp": 0.0046611507423222065, "grad/layer_16/attn_mlp_ratio": 0.7908993515974341, "grad/layer_20/attn": 0.003558563068509102, "grad/layer_20/mlp": 0.006754188798367977, "grad/layer_20/attn_mlp_ratio": 0.5268675664918051, "grad/layer_24/attn": 0.015332582406699657, "grad/layer_24/mlp": 0.01353489700704813, "grad/layer_24/attn_mlp_ratio": 1.1328185419832564, "grad/layer_27/attn": 0.005441031884402037, "grad/layer_27/mlp": 0.013328326866030693, "grad/layer_27/attn_mlp_ratio": 0.4082306727820641} {"step": 9750, "timestamp": 1778336252.1978688, "eos/sharpness": 50.75986385345458, "eos/L0_probe": 2.324885368347168, "eos/L_plus": 2.5807597637176514, "eos/L_minus": 2.5766096115112305, "eos/grad_norm": 0.19904513657093048, "eos/embed_grad_frac": 0.06578442454338074, "eos/time_s": 0.5951292514801025} {"step": 9750, "timestamp": 1778336252.2171955, "train/loss": 2.3126621723175047, "train/z_loss": 0.001357792958151549, "train/perplexity": 10.101280240682499, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913304.5173799125, "perf/iters_per_sec": 0.9123346888446391, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.096088981628418, "data/tokens_consumed": 20449329152, "data/tokens_consumed_B": 20.449329152, "train/loss_slope": -9.511685249792239e-07} {"step": 9750, "timestamp": 1778336253.5832508, "geo/rankme_last": 428.86614990234375, "geo/layer_0/stable_rank_q_proj": 20.71151351928711, "geo/layer_0/stable_rank_k_proj": 17.154191970825195, "geo/layer_0/stable_rank_o_proj": 44.62730407714844, "geo/layer_0/stable_rank_gate_proj": 126.38209533691406, "geo/layer_0/stable_rank_down_proj": 57.20573425292969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06385093182325363, "geo/layer_0/attn_entropy_mean": 6.237349033355713, "geo/layer_0/attn_entropy_std": 0.4472496211528778, "geo/layer_7/stable_rank_q_proj": 42.277225494384766, "geo/layer_7/stable_rank_k_proj": 38.88254928588867, "geo/layer_7/stable_rank_o_proj": 88.64363861083984, "geo/layer_7/stable_rank_gate_proj": 78.75447082519531, "geo/layer_7/stable_rank_down_proj": 143.52627563476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3920706510543823, "geo/layer_7/attn_entropy_mean": 4.754753589630127, "geo/layer_7/attn_entropy_std": 0.7632536292076111, "geo/layer_14/stable_rank_q_proj": 51.75800704956055, "geo/layer_14/stable_rank_k_proj": 42.55078125, "geo/layer_14/stable_rank_o_proj": 42.395450592041016, "geo/layer_14/stable_rank_gate_proj": 71.92765808105469, "geo/layer_14/stable_rank_down_proj": 126.57270812988281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3674584627151489, "geo/layer_14/attn_entropy_mean": 5.495720863342285, "geo/layer_14/attn_entropy_std": 0.4504603147506714, "geo/layer_21/stable_rank_q_proj": 39.105899810791016, "geo/layer_21/stable_rank_k_proj": 28.669189453125, "geo/layer_21/stable_rank_o_proj": 65.49286651611328, "geo/layer_21/stable_rank_gate_proj": 60.88869857788086, "geo/layer_21/stable_rank_down_proj": 49.578067779541016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1377965658903122, "geo/layer_21/attn_entropy_mean": 5.862348556518555, "geo/layer_21/attn_entropy_std": 0.32285818457603455, "geo/layer_27/stable_rank_q_proj": 44.11899185180664, "geo/layer_27/stable_rank_k_proj": 30.085044860839844, "geo/layer_27/stable_rank_o_proj": 107.4151611328125, "geo/layer_27/stable_rank_gate_proj": 70.96843719482422, "geo/layer_27/stable_rank_down_proj": 129.0947723388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09675389528274536, "geo/layer_27/attn_entropy_mean": 4.334320068359375, "geo/layer_27/attn_entropy_std": 0.6920362710952759, "attnres/final_alpha/block_0": 0.25770947337150574, "attnres/block_norm/0": 1.7798855304718018, "attnres/final_alpha/block_1": 0.003998140804469585, "attnres/block_norm/1": 50386.828125, "attnres/final_alpha/block_2": 0.008525092154741287, "attnres/block_norm/2": 29851.63671875, "attnres/final_alpha/block_3": 0.010571395047008991, "attnres/block_norm/3": 71505.796875, "attnres/final_alpha/block_4": 0.011993467807769775, "attnres/block_norm/4": 17363.263671875, "attnres/final_alpha/block_5": 0.6057065725326538, "attnres/block_norm/5": 7194.4443359375, "attnres/final_alpha/block_6": 0.1014958918094635, "attnres/block_norm/6": 48120.828125, "geo/tier1_time_s": 1.3619415760040283, "geo/step": 9750.0, "geo/rankme_slope": 0.0003746642993134754} {"step": 9760, "timestamp": 1778336263.9595244, "train/loss": 2.3343181848526, "train/z_loss": 0.0013764903764240445, "train/perplexity": 10.322419547517462, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786602.1413179224, "perf/iters_per_sec": 0.851918287905656, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.173821496963501, "data/tokens_consumed": 20470300672, "data/tokens_consumed_B": 20.470300672, "train/loss_slope": -7.127599890725399e-07} {"step": 9770, "timestamp": 1778336274.338853, "train/loss": 2.3468653917312623, "train/z_loss": 0.001357431139331311, "train/perplexity": 10.452753031775323, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021961.4557003067, "perf/iters_per_sec": 0.9641463545323881, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371869325637817, "data/tokens_consumed": 20491272192, "data/tokens_consumed_B": 20.491272192, "train/loss_slope": -1.8750800765958097e-06} {"step": 9780, "timestamp": 1778336284.707017, "train/loss": 2.3810131311416627, "train/z_loss": 0.0013445452554151417, "train/perplexity": 10.815855194642186, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024092.153807562, "perf/iters_per_sec": 0.9651623505628405, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360951185226441, "data/tokens_consumed": 20512243712, "data/tokens_consumed_B": 20.512243712, "train/loss_slope": 4.789044540234137e-09} {"step": 9790, "timestamp": 1778336295.0963902, "train/loss": 2.3215897560119627, "train/z_loss": 0.0013605465763248503, "train/perplexity": 10.191864010907766, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019669.8568605124, "perf/iters_per_sec": 0.963053635053879, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038363766670227, "data/tokens_consumed": 20533215232, "data/tokens_consumed_B": 20.533215232, "train/loss_slope": -2.4636592229778694e-06} {"step": 9800, "timestamp": 1778336305.4521441, "grad/layer_0/attn": 0.0027491385117173195, "grad/layer_0/mlp": 0.0030213736463338137, "grad/layer_0/attn_mlp_ratio": 0.9098968689501492, "grad/layer_4/attn": 0.002252548700198531, "grad/layer_4/mlp": 0.002601444721221924, "grad/layer_4/attn_mlp_ratio": 0.8658837127056522, "grad/layer_8/attn": 0.0058283195830881596, "grad/layer_8/mlp": 0.003485335037112236, "grad/layer_8/attn_mlp_ratio": 1.6722407899967968, "grad/layer_12/attn": 0.00824650563299656, "grad/layer_12/mlp": 0.006414959207177162, "grad/layer_12/attn_mlp_ratio": 1.2855117605766, "grad/layer_16/attn": 0.0038449312560260296, "grad/layer_16/mlp": 0.004318344406783581, "grad/layer_16/attn_mlp_ratio": 0.8903715879976973, "grad/layer_20/attn": 0.003928883001208305, "grad/layer_20/mlp": 0.006358655169606209, "grad/layer_20/attn_mlp_ratio": 0.6178795412904372, "grad/layer_24/attn": 0.01741548255085945, "grad/layer_24/mlp": 0.01511469203978777, "grad/layer_24/attn_mlp_ratio": 1.152222115395596, "grad/layer_27/attn": 0.004921057261526585, "grad/layer_27/mlp": 0.016003817319869995, "grad/layer_27/attn_mlp_ratio": 0.30749271454551247} {"step": 9800, "timestamp": 1778336305.4678884, "train/loss": 2.3661117553710938, "train/z_loss": 0.0013609736342914402, "train/perplexity": 10.655878965289915, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023629.7972259724, "perf/iters_per_sec": 0.9649418817643988, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036331844329834, "data/tokens_consumed": 20554186752, "data/tokens_consumed_B": 20.554186752, "train/loss_slope": 4.353186049120285e-07} {"step": 9810, "timestamp": 1778336315.8281744, "train/loss": 2.3230762243270875, "train/z_loss": 0.0013613451039418579, "train/perplexity": 10.207025159323754, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025740.2682087678, "perf/iters_per_sec": 0.9659482327503051, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352521657943725, "data/tokens_consumed": 20575158272, "data/tokens_consumed_B": 20.575158272, "train/loss_slope": -1.1141603500177729e-06} {"step": 9820, "timestamp": 1778336326.20248, "train/loss": 2.331398296356201, "train/z_loss": 0.0013601684011518955, "train/perplexity": 10.292323193806979, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022425.7006588243, "perf/iters_per_sec": 0.9643677237791177, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036948847770691, "data/tokens_consumed": 20596129792, "data/tokens_consumed_B": 20.596129792, "train/loss_slope": -1.6484136950529173e-06} {"step": 9825, "timestamp": 1778336331.9901586, "eos/sharpness": 49.29018020629882, "eos/L0_probe": 2.318735122680664, "eos/L_plus": 2.5932393074035645, "eos/L_minus": 2.537132740020752, "eos/grad_norm": 0.18455596268177032, "eos/embed_grad_frac": 0.07206167280673981, "eos/time_s": 0.6086721420288086} {"step": 9825, "timestamp": 1778336333.381239, "geo/rankme_last": 429.6629943847656, "geo/layer_0/stable_rank_q_proj": 20.738801956176758, "geo/layer_0/stable_rank_k_proj": 17.197572708129883, "geo/layer_0/stable_rank_o_proj": 44.594058990478516, "geo/layer_0/stable_rank_gate_proj": 126.25837707519531, "geo/layer_0/stable_rank_down_proj": 57.37289047241211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07014838606119156, "geo/layer_0/attn_entropy_mean": 6.241283416748047, "geo/layer_0/attn_entropy_std": 0.43953999876976013, "geo/layer_7/stable_rank_q_proj": 42.28166198730469, "geo/layer_7/stable_rank_k_proj": 38.93495178222656, "geo/layer_7/stable_rank_o_proj": 88.65648651123047, "geo/layer_7/stable_rank_gate_proj": 78.66997528076172, "geo/layer_7/stable_rank_down_proj": 143.71023559570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4050661325454712, "geo/layer_7/attn_entropy_mean": 4.757236957550049, "geo/layer_7/attn_entropy_std": 0.755789041519165, "geo/layer_14/stable_rank_q_proj": 51.72104263305664, "geo/layer_14/stable_rank_k_proj": 42.55580520629883, "geo/layer_14/stable_rank_o_proj": 42.383766174316406, "geo/layer_14/stable_rank_gate_proj": 72.01738739013672, "geo/layer_14/stable_rank_down_proj": 126.53512573242188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37634867429733276, "geo/layer_14/attn_entropy_mean": 5.520798206329346, "geo/layer_14/attn_entropy_std": 0.43368324637413025, "geo/layer_21/stable_rank_q_proj": 39.15604782104492, "geo/layer_21/stable_rank_k_proj": 28.709774017333984, "geo/layer_21/stable_rank_o_proj": 65.4734878540039, "geo/layer_21/stable_rank_gate_proj": 60.862709045410156, "geo/layer_21/stable_rank_down_proj": 49.508609771728516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13731205463409424, "geo/layer_21/attn_entropy_mean": 5.860644340515137, "geo/layer_21/attn_entropy_std": 0.3062871992588043, "geo/layer_27/stable_rank_q_proj": 44.11155319213867, "geo/layer_27/stable_rank_k_proj": 30.069072723388672, "geo/layer_27/stable_rank_o_proj": 107.1607437133789, "geo/layer_27/stable_rank_gate_proj": 70.87789916992188, "geo/layer_27/stable_rank_down_proj": 129.2511444091797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11040867865085602, "geo/layer_27/attn_entropy_mean": 4.317228317260742, "geo/layer_27/attn_entropy_std": 0.710548996925354, "attnres/final_alpha/block_0": 0.2554914057254791, "attnres/block_norm/0": 1.779761552810669, "attnres/final_alpha/block_1": 0.0038676792755723, "attnres/block_norm/1": 50507.703125, "attnres/final_alpha/block_2": 0.00839061290025711, "attnres/block_norm/2": 29839.908203125, "attnres/final_alpha/block_3": 0.010462356731295586, "attnres/block_norm/3": 71555.921875, "attnres/final_alpha/block_4": 0.011762892827391624, "attnres/block_norm/4": 17241.638671875, "attnres/final_alpha/block_5": 0.6120034456253052, "attnres/block_norm/5": 7134.23046875, "attnres/final_alpha/block_6": 0.09802159667015076, "attnres/block_norm/6": 47802.22265625, "geo/tier1_time_s": 1.3606774806976318, "geo/step": 9825.0, "geo/rankme_slope": 0.0003903177481930272} {"step": 9830, "timestamp": 1778336338.5691643, "train/loss": 2.357621693611145, "train/z_loss": 0.0013604102889075876, "train/perplexity": 10.56579285421664, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696733.7376480787, "perf/iters_per_sec": 0.8090656936874765, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2359935760498046, "data/tokens_consumed": 20617101312, "data/tokens_consumed_B": 20.617101312, "train/loss_slope": -1.1163526278087987e-06} {"step": 9840, "timestamp": 1778336348.9277878, "train/loss": 2.3349552154541016, "train/z_loss": 0.0013526224298402666, "train/perplexity": 10.328997339555734, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025566.5014378238, "perf/iters_per_sec": 0.965865374297058, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353409767150878, "data/tokens_consumed": 20638072832, "data/tokens_consumed_B": 20.638072832, "train/loss_slope": -5.201954187804032e-07} {"step": 9850, "timestamp": 1778336359.2818754, "grad/layer_0/attn": 0.0030695467721670866, "grad/layer_0/mlp": 0.003269592300057411, "grad/layer_0/attn_mlp_ratio": 0.938816340566851, "grad/layer_4/attn": 0.0019499040208756924, "grad/layer_4/mlp": 0.0027142963372170925, "grad/layer_4/attn_mlp_ratio": 0.7183828538915541, "grad/layer_8/attn": 0.0069628190249204636, "grad/layer_8/mlp": 0.003432457335293293, "grad/layer_8/attn_mlp_ratio": 2.0285230497914424, "grad/layer_12/attn": 0.0079886419698596, "grad/layer_12/mlp": 0.006311233155429363, "grad/layer_12/attn_mlp_ratio": 1.2657814481800707, "grad/layer_16/attn": 0.003998440224677324, "grad/layer_16/mlp": 0.004552731290459633, "grad/layer_16/attn_mlp_ratio": 0.8782508524566506, "grad/layer_20/attn": 0.003292944747954607, "grad/layer_20/mlp": 0.005841291043907404, "grad/layer_20/attn_mlp_ratio": 0.5637357678001073, "grad/layer_24/attn": 0.01081327348947525, "grad/layer_24/mlp": 0.009282920509576797, "grad/layer_24/attn_mlp_ratio": 1.1648568316225447, "grad/layer_27/attn": 0.006007714197039604, "grad/layer_27/mlp": 0.008852509781718254, "grad/layer_27/attn_mlp_ratio": 0.6786452969057313} {"step": 9850, "timestamp": 1778336359.298232, "train/loss": 2.3622587442398073, "train/z_loss": 0.00135538981994614, "train/perplexity": 10.614900740496985, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023219.2597690371, "perf/iters_per_sec": 0.9647461222500978, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365421295166015, "data/tokens_consumed": 20659044352, "data/tokens_consumed_B": 20.659044352, "train/loss_slope": 4.5614165298373344e-07} {"step": 9860, "timestamp": 1778336369.6640468, "train/loss": 2.3253277063369753, "train/z_loss": 0.001369201799388975, "train/perplexity": 10.230031982850289, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024539.1580326217, "perf/iters_per_sec": 0.9653754987872227, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358663558959962, "data/tokens_consumed": 20680015872, "data/tokens_consumed_B": 20.680015872, "train/loss_slope": -2.641023904254255e-07} {"step": 9870, "timestamp": 1778336380.031255, "train/loss": 2.349891424179077, "train/z_loss": 0.0013577085686847568, "train/perplexity": 10.484431307190295, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023834.6162224552, "perf/iters_per_sec": 0.9650395470726276, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362269639968873, "data/tokens_consumed": 20700987392, "data/tokens_consumed_B": 20.700987392, "train/loss_slope": 1.3122203219163025e-06} {"step": 9880, "timestamp": 1778336390.3905575, "train/loss": 2.29105806350708, "train/z_loss": 0.0013633725000545382, "train/perplexity": 9.88539152180671, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025341.6984728219, "perf/iters_per_sec": 0.9657581798900708, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354558944702148, "data/tokens_consumed": 20721958912, "data/tokens_consumed_B": 20.721958912, "train/loss_slope": -2.681298989846104e-06} {"step": 9890, "timestamp": 1778336400.7561705, "train/loss": 2.3476618766784667, "train/z_loss": 0.0013584191328845918, "train/perplexity": 10.461081808654377, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024521.4511402703, "perf/iters_per_sec": 0.9653670554829933, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035875415802002, "data/tokens_consumed": 20742930432, "data/tokens_consumed_B": 20.742930432, "train/loss_slope": 5.573477431743539e-07} {"step": 9900, "timestamp": 1778336411.1066968, "grad/layer_0/attn": 0.0029897785279899836, "grad/layer_0/mlp": 0.003081058617681265, "grad/layer_0/attn_mlp_ratio": 0.9703737584852077, "grad/layer_4/attn": 0.00242703128606081, "grad/layer_4/mlp": 0.002651665359735489, "grad/layer_4/attn_mlp_ratio": 0.9152856281889018, "grad/layer_8/attn": 0.00731491856276989, "grad/layer_8/mlp": 0.0035521548707038164, "grad/layer_8/attn_mlp_ratio": 2.0592903809375547, "grad/layer_12/attn": 0.0057247416116297245, "grad/layer_12/mlp": 0.007767826784402132, "grad/layer_12/attn_mlp_ratio": 0.7369811012556239, "grad/layer_16/attn": 0.0038893248420208693, "grad/layer_16/mlp": 0.00423494353890419, "grad/layer_16/attn_mlp_ratio": 0.9183888083637517, "grad/layer_20/attn": 0.00307516660541296, "grad/layer_20/mlp": 0.006223312113434076, "grad/layer_20/attn_mlp_ratio": 0.4941366429880685, "grad/layer_24/attn": 0.014163435436785221, "grad/layer_24/mlp": 0.012579905800521374, "grad/layer_24/attn_mlp_ratio": 1.1258776932662284, "grad/layer_27/attn": 0.008299139328300953, "grad/layer_27/mlp": 0.012828109785914421, "grad/layer_27/attn_mlp_ratio": 0.6469495040273713} {"step": 9900, "timestamp": 1778336411.7166455, "eos/sharpness": 60.44094562530516, "eos/L0_probe": 2.316575288772583, "eos/L_plus": 2.6013894081115723, "eos/L_minus": 2.6361706256866455, "eos/grad_norm": 0.21538586914539337, "eos/embed_grad_frac": 0.05525555461645126, "eos/time_s": 0.6071157455444336} {"step": 9900, "timestamp": 1778336411.7362375, "train/loss": 2.314563536643982, "train/z_loss": 0.0013586001587100327, "train/perplexity": 10.120504725166704, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910823.3687333732, "perf/iters_per_sec": 0.9111515849749438, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0975122213363648, "data/tokens_consumed": 20763901952, "data/tokens_consumed_B": 20.763901952, "train/loss_slope": -2.577260961913092e-06} {"step": 9900, "timestamp": 1778336413.1028903, "geo/rankme_last": 429.13946533203125, "geo/layer_0/stable_rank_q_proj": 20.76058578491211, "geo/layer_0/stable_rank_k_proj": 17.207834243774414, "geo/layer_0/stable_rank_o_proj": 44.54459762573242, "geo/layer_0/stable_rank_gate_proj": 126.20745849609375, "geo/layer_0/stable_rank_down_proj": 57.371280670166016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06455960869789124, "geo/layer_0/attn_entropy_mean": 6.244200706481934, "geo/layer_0/attn_entropy_std": 0.4426754415035248, "geo/layer_7/stable_rank_q_proj": 42.329872131347656, "geo/layer_7/stable_rank_k_proj": 38.954917907714844, "geo/layer_7/stable_rank_o_proj": 88.66763305664062, "geo/layer_7/stable_rank_gate_proj": 78.77386474609375, "geo/layer_7/stable_rank_down_proj": 143.92367553710938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3990454077720642, "geo/layer_7/attn_entropy_mean": 4.748391628265381, "geo/layer_7/attn_entropy_std": 0.7648336887359619, "geo/layer_14/stable_rank_q_proj": 51.69669723510742, "geo/layer_14/stable_rank_k_proj": 42.65238952636719, "geo/layer_14/stable_rank_o_proj": 42.412357330322266, "geo/layer_14/stable_rank_gate_proj": 71.97659301757812, "geo/layer_14/stable_rank_down_proj": 126.63921356201172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3780885934829712, "geo/layer_14/attn_entropy_mean": 5.5236358642578125, "geo/layer_14/attn_entropy_std": 0.46071815490722656, "geo/layer_21/stable_rank_q_proj": 39.13465118408203, "geo/layer_21/stable_rank_k_proj": 28.710098266601562, "geo/layer_21/stable_rank_o_proj": 65.47992706298828, "geo/layer_21/stable_rank_gate_proj": 60.94480895996094, "geo/layer_21/stable_rank_down_proj": 49.52878189086914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1382087916135788, "geo/layer_21/attn_entropy_mean": 5.853878021240234, "geo/layer_21/attn_entropy_std": 0.32298707962036133, "geo/layer_27/stable_rank_q_proj": 44.038490295410156, "geo/layer_27/stable_rank_k_proj": 30.146821975708008, "geo/layer_27/stable_rank_o_proj": 107.14674377441406, "geo/layer_27/stable_rank_gate_proj": 70.8598403930664, "geo/layer_27/stable_rank_down_proj": 129.24554443359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1009511798620224, "geo/layer_27/attn_entropy_mean": 4.305140018463135, "geo/layer_27/attn_entropy_std": 0.6960572600364685, "attnres/final_alpha/block_0": 0.2582482099533081, "attnres/block_norm/0": 1.7800918817520142, "attnres/final_alpha/block_1": 0.003989716526120901, "attnres/block_norm/1": 50457.7578125, "attnres/final_alpha/block_2": 0.008606372401118279, "attnres/block_norm/2": 29808.24609375, "attnres/final_alpha/block_3": 0.01048717088997364, "attnres/block_norm/3": 71834.859375, "attnres/final_alpha/block_4": 0.012048602104187012, "attnres/block_norm/4": 17257.431640625, "attnres/final_alpha/block_5": 0.604326605796814, "attnres/block_norm/5": 7181.7197265625, "attnres/final_alpha/block_6": 0.10229333490133286, "attnres/block_norm/6": 47693.890625, "geo/tier1_time_s": 1.3623406887054443, "geo/step": 9900.0, "geo/rankme_slope": 0.0003709313608255802} {"step": 9910, "timestamp": 1778336423.459201, "train/loss": 2.4030921459198, "train/z_loss": 0.001345132675487548, "train/perplexity": 11.057314403183286, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789566.4662814229, "perf/iters_per_sec": 0.8533317881972422, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171877121925354, "data/tokens_consumed": 20784873472, "data/tokens_consumed_B": 20.784873472, "train/loss_slope": -2.1214294605271872e-06} {"step": 9920, "timestamp": 1778336433.8181806, "train/loss": 2.2932029724121095, "train/z_loss": 0.0013580585247837007, "train/perplexity": 9.906617541913546, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025778.1509903166, "perf/iters_per_sec": 0.9659662966682037, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352328062057494, "data/tokens_consumed": 20805844992, "data/tokens_consumed_B": 20.805844992, "train/loss_slope": -5.919548630392469e-06} {"step": 9930, "timestamp": 1778336444.1920598, "train/loss": 2.318761372566223, "train/z_loss": 0.001361585280392319, "train/perplexity": 10.163078239248023, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022484.80421055, "perf/iters_per_sec": 0.9643959065487623, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036918544769287, "data/tokens_consumed": 20826816512, "data/tokens_consumed_B": 20.826816512, "train/loss_slope": -6.7744289687757305e-06} {"step": 9940, "timestamp": 1778336454.5491867, "train/loss": 2.3348114252090455, "train/z_loss": 0.0013512419071048498, "train/perplexity": 10.327512237271266, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025874.6371833012, "perf/iters_per_sec": 0.9660123048702722, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351835012435913, "data/tokens_consumed": 20847788032, "data/tokens_consumed_B": 20.847788032, "train/loss_slope": -5.488130719390552e-06} {"step": 9950, "timestamp": 1778336464.891986, "grad/layer_0/attn": 0.0033807847648859024, "grad/layer_0/mlp": 0.003343357937410474, "grad/layer_0/attn_mlp_ratio": 1.01119435222212, "grad/layer_4/attn": 0.001887914608232677, "grad/layer_4/mlp": 0.002607759553939104, "grad/layer_4/attn_mlp_ratio": 0.7239603563084971, "grad/layer_8/attn": 0.0039559341967105865, "grad/layer_8/mlp": 0.003550382563844323, "grad/layer_8/attn_mlp_ratio": 1.1142275555241519, "grad/layer_12/attn": 0.008116021752357483, "grad/layer_12/mlp": 0.006650417111814022, "grad/layer_12/attn_mlp_ratio": 1.2203778340312113, "grad/layer_16/attn": 0.0035868308041244745, "grad/layer_16/mlp": 0.004445353522896767, "grad/layer_16/attn_mlp_ratio": 0.8068718730608332, "grad/layer_20/attn": 0.0029763015918433666, "grad/layer_20/mlp": 0.005704642739146948, "grad/layer_20/attn_mlp_ratio": 0.5217332049991112, "grad/layer_24/attn": 0.01131999958306551, "grad/layer_24/mlp": 0.009526067413389683, "grad/layer_24/attn_mlp_ratio": 1.1883182191554182, "grad/layer_27/attn": 0.00578177347779274, "grad/layer_27/mlp": 0.009218388237059116, "grad/layer_27/attn_mlp_ratio": 0.6272000339310139} {"step": 9950, "timestamp": 1778336464.907825, "train/loss": 2.406869053840637, "train/z_loss": 0.001356263633351773, "train/perplexity": 11.09915582740003, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025435.7178744331, "perf/iters_per_sec": 0.9658030118343511, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035407829284668, "data/tokens_consumed": 20868759552, "data/tokens_consumed_B": 20.868759552, "train/loss_slope": -2.4064516971105795e-06} {"step": 9960, "timestamp": 1778336475.272425, "train/loss": 2.3627236127853393, "train/z_loss": 0.0013566891895607114, "train/perplexity": 10.619836421097633, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024423.1835172453, "perf/iters_per_sec": 0.9653201978288867, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359256982803344, "data/tokens_consumed": 20889731072, "data/tokens_consumed_B": 20.889731072, "train/loss_slope": -2.4836009020136816e-06} {"step": 9970, "timestamp": 1778336485.634189, "train/loss": 2.31323356628418, "train/z_loss": 0.0013676962698809803, "train/perplexity": 10.107053700570711, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025393.837122387, "perf/iters_per_sec": 0.965783041535562, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354292392730713, "data/tokens_consumed": 20910702592, "data/tokens_consumed_B": 20.910702592, "train/loss_slope": -5.9021216557614175e-06} {"step": 9975, "timestamp": 1778336491.4125721, "eos/sharpness": 57.22529888153075, "eos/L0_probe": 2.323521137237549, "eos/L_plus": 2.599275827407837, "eos/L_minus": 2.6200194358825684, "eos/grad_norm": 0.20803068578243256, "eos/embed_grad_frac": 0.054975878447294235, "eos/time_s": 0.6098170280456543} {"step": 9975, "timestamp": 1778336492.7947154, "geo/rankme_last": 428.51983642578125, "geo/layer_0/stable_rank_q_proj": 20.774982452392578, "geo/layer_0/stable_rank_k_proj": 17.180505752563477, "geo/layer_0/stable_rank_o_proj": 44.52273178100586, "geo/layer_0/stable_rank_gate_proj": 126.34143829345703, "geo/layer_0/stable_rank_down_proj": 57.3456916809082, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060974426567554474, "geo/layer_0/attn_entropy_mean": 6.239912033081055, "geo/layer_0/attn_entropy_std": 0.44492411613464355, "geo/layer_7/stable_rank_q_proj": 42.29944610595703, "geo/layer_7/stable_rank_k_proj": 38.960567474365234, "geo/layer_7/stable_rank_o_proj": 88.56991577148438, "geo/layer_7/stable_rank_gate_proj": 78.70409393310547, "geo/layer_7/stable_rank_down_proj": 143.93199157714844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40806567668914795, "geo/layer_7/attn_entropy_mean": 4.707541465759277, "geo/layer_7/attn_entropy_std": 0.7757891416549683, "geo/layer_14/stable_rank_q_proj": 51.592063903808594, "geo/layer_14/stable_rank_k_proj": 42.62314987182617, "geo/layer_14/stable_rank_o_proj": 42.45048522949219, "geo/layer_14/stable_rank_gate_proj": 71.88013458251953, "geo/layer_14/stable_rank_down_proj": 126.43844604492188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38199806213378906, "geo/layer_14/attn_entropy_mean": 5.505751609802246, "geo/layer_14/attn_entropy_std": 0.4777525067329407, "geo/layer_21/stable_rank_q_proj": 39.106441497802734, "geo/layer_21/stable_rank_k_proj": 28.72046661376953, "geo/layer_21/stable_rank_o_proj": 65.48845672607422, "geo/layer_21/stable_rank_gate_proj": 60.97684097290039, "geo/layer_21/stable_rank_down_proj": 49.53330612182617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1380832940340042, "geo/layer_21/attn_entropy_mean": 5.853311538696289, "geo/layer_21/attn_entropy_std": 0.32522064447402954, "geo/layer_27/stable_rank_q_proj": 44.02853012084961, "geo/layer_27/stable_rank_k_proj": 30.171607971191406, "geo/layer_27/stable_rank_o_proj": 107.16778564453125, "geo/layer_27/stable_rank_gate_proj": 70.79215240478516, "geo/layer_27/stable_rank_down_proj": 129.3760528564453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09697144478559494, "geo/layer_27/attn_entropy_mean": 4.308872222900391, "geo/layer_27/attn_entropy_std": 0.6960240602493286, "attnres/final_alpha/block_0": 0.25720909237861633, "attnres/block_norm/0": 1.780182123184204, "attnres/final_alpha/block_1": 0.003851685207337141, "attnres/block_norm/1": 50522.1953125, "attnres/final_alpha/block_2": 0.008552102372050285, "attnres/block_norm/2": 29796.765625, "attnres/final_alpha/block_3": 0.01060915645211935, "attnres/block_norm/3": 71758.46875, "attnres/final_alpha/block_4": 0.011624574661254883, "attnres/block_norm/4": 17235.212890625, "attnres/final_alpha/block_5": 0.6069403290748596, "attnres/block_norm/5": 7171.6259765625, "attnres/final_alpha/block_6": 0.10121305286884308, "attnres/block_norm/6": 48072.5234375, "geo/tier1_time_s": 1.3626468181610107, "geo/step": 9975.0, "geo/rankme_slope": 0.0003737809381565126} {"step": 9980, "timestamp": 1778336497.9738057, "train/loss": 2.3386704206466673, "train/z_loss": 0.0013634650502353906, "train/perplexity": 10.367443056748348, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700572.0969129473, "perf/iters_per_sec": 0.810895966011499, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2332038164138794, "data/tokens_consumed": 20931674112, "data/tokens_consumed_B": 20.931674112, "train/loss_slope": -3.457176052268597e-06} {"step": 9990, "timestamp": 1778336508.3357677, "train/loss": 2.323620057106018, "train/z_loss": 0.0013606200809590518, "train/perplexity": 10.212577583839145, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024852.621295009, "perf/iters_per_sec": 0.9655249697184606, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357059955596923, "data/tokens_consumed": 20952645632, "data/tokens_consumed_B": 20.952645632, "train/loss_slope": -5.2671389289349945e-06} {"step": 10000, "timestamp": 1778336518.681716, "grad/layer_0/attn": 0.003778406884521246, "grad/layer_0/mlp": 0.0034799922723323107, "grad/layer_0/attn_mlp_ratio": 1.085751484561139, "grad/layer_4/attn": 0.003041083225980401, "grad/layer_4/mlp": 0.0027200111653655767, "grad/layer_4/attn_mlp_ratio": 1.1180406731042238, "grad/layer_8/attn": 0.004845635034143925, "grad/layer_8/mlp": 0.0034481934271752834, "grad/layer_8/attn_mlp_ratio": 1.4052677136463996, "grad/layer_12/attn": 0.008644646033644676, "grad/layer_12/mlp": 0.006987274158746004, "grad/layer_12/attn_mlp_ratio": 1.2371986147279295, "grad/layer_16/attn": 0.004477526061236858, "grad/layer_16/mlp": 0.004655833356082439, "grad/layer_16/attn_mlp_ratio": 0.9617023683240564, "grad/layer_20/attn": 0.0028468898963183165, "grad/layer_20/mlp": 0.005973042920231819, "grad/layer_20/attn_mlp_ratio": 0.47662303564119085, "grad/layer_24/attn": 0.008864742703735828, "grad/layer_24/mlp": 0.008949587121605873, "grad/layer_24/attn_mlp_ratio": 0.9905197283663302, "grad/layer_27/attn": 0.009579909034073353, "grad/layer_27/mlp": 0.007152473088353872, "grad/layer_27/attn_mlp_ratio": 1.3393841237562403} {"step": 10000, "timestamp": 1778336518.69771, "train/loss": 2.3128947973251344, "train/z_loss": 0.0013686489663086831, "train/perplexity": 10.103630324409082, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025188.6091685416, "perf/iters_per_sec": 0.9656851812212666, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355341672897338, "data/tokens_consumed": 20973617152, "data/tokens_consumed_B": 20.973617152, "train/loss_slope": -5.6387642214615015e-06} {"step": 10000, "timestamp": 1778336525.7913902, "geo/ww_alpha_mean": 7.670086894064376, "geo/ww_alpha_std": 4.521657278264401, "geo/ww_alpha_min": 1.3304096502050606, "geo/ww_alpha_max": 25.95998785706521, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9458932915882885, "geo/ww_alpha_by_type/k_proj": 4.373218107328995, "geo/ww_alpha_by_type/v_proj": 9.113420026279114, "geo/ww_alpha_by_type/o_proj": 8.507439271620536, "geo/ww_alpha_by_type/gate_proj": 7.595442733259113, "geo/ww_alpha_by_type/up_proj": 12.037988394565527, "geo/ww_alpha_by_type/down_proj": 8.213265670490397, "geo/twonn_id/layer_0": 0.7140015959739685, "geo/twonn_id/layer_7": 3.643446207046509, "geo/twonn_id/layer_14": 5.180261135101318, "geo/twonn_id/layer_21": 8.033085823059082, "geo/twonn_id/layer_27": 6.3058695793151855, "geo/tier2_time_s": 7.08735728263855} {"step": 10000, "timestamp": 1778336526.5658944, "eoc/jacobian_sigma/layer_0/attn": 1633.637451171875, "eoc/jacobian_sigma/layer_0/mlp": 10491.6953125, "eoc/jacobian_sigma/layer_0": 10491.6953125, "eoc/jacobian_sigma/layer_7/attn": 1.13225257396698, "eoc/jacobian_sigma/layer_7/mlp": 1.7462382316589355, "eoc/jacobian_sigma/layer_7": 1.7462382316589355, "eoc/jacobian_sigma/layer_14/attn": 1.8234198093414307, "eoc/jacobian_sigma/layer_14/mlp": 13.664511680603027, "eoc/jacobian_sigma/layer_14": 13.664511680603027, "eoc/jacobian_sigma/layer_21/attn": 1.0984314680099487, "eoc/jacobian_sigma/layer_21/mlp": 5.588784217834473, "eoc/jacobian_sigma/layer_21": 5.588784217834473, "eoc/jacobian_sigma/layer_27/attn": 3.9543018341064453, "eoc/jacobian_sigma/layer_27/mlp": 26.416006088256836, "eoc/jacobian_sigma/layer_27": 26.416006088256836, "eoc/layer0_sigma": 10491.6953125, "eoc/sigma_max": 26.416006088256836, "eoc/sigma_min": 1.7462382316589355, "eoc/sigma_mean": 11.853885054588318, "eoc/time_s": 0.768132209777832} {"step": 10010, "timestamp": 1778336536.9418602, "train/loss": 2.3240586280822755, "train/z_loss": 0.00135519509203732, "train/perplexity": 10.217057506270363, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1149818.441092487, "perf/iters_per_sec": 0.5482761579000888, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8238983869552612, "data/tokens_consumed": 20994588672, "data/tokens_consumed_B": 20.994588672, "train/loss_slope": -5.420124069406139e-06} {"step": 10020, "timestamp": 1778336547.330946, "train/loss": 2.3502052545547487, "train/z_loss": 0.001351042662281543, "train/perplexity": 10.487722156563372, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019582.6782271476, "perf/iters_per_sec": 0.9630120650420892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038408589363098, "data/tokens_consumed": 21015560192, "data/tokens_consumed_B": 21.015560192, "train/loss_slope": -4.9506144948524895e-06} {"step": 10030, "timestamp": 1778336557.6940866, "train/loss": 2.3296581983566282, "train/z_loss": 0.0013524705893360079, "train/perplexity": 10.274429116046122, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024856.0705832131, "perf/iters_per_sec": 0.9655266144672456, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357042312622071, "data/tokens_consumed": 21036531712, "data/tokens_consumed_B": 21.036531712, "train/loss_slope": -7.483217432232118e-06} {"step": 10040, "timestamp": 1778336568.0504284, "train/loss": 2.3390156030654907, "train/z_loss": 0.0013539482722990214, "train/perplexity": 10.371022333535862, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025797.606122137, "perf/iters_per_sec": 0.9659755735979734, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035222864151001, "data/tokens_consumed": 21057503232, "data/tokens_consumed_B": 21.057503232, "train/loss_slope": -8.587552332999725e-06} {"step": 10050, "timestamp": 1778336578.4103637, "grad/layer_0/attn": 0.00276758149266243, "grad/layer_0/mlp": 0.002975841285660863, "grad/layer_0/attn_mlp_ratio": 0.9300164672747894, "grad/layer_4/attn": 0.001918410649523139, "grad/layer_4/mlp": 0.0025995420292019844, "grad/layer_4/attn_mlp_ratio": 0.7379802111966767, "grad/layer_8/attn": 0.002889050170779228, "grad/layer_8/mlp": 0.0033010721672326326, "grad/layer_8/attn_mlp_ratio": 0.875185375205732, "grad/layer_12/attn": 0.011697343550622463, "grad/layer_12/mlp": 0.007360886316746473, "grad/layer_12/attn_mlp_ratio": 1.5891215932921179, "grad/layer_16/attn": 0.0034048932138830423, "grad/layer_16/mlp": 0.004556227941066027, "grad/layer_16/attn_mlp_ratio": 0.7473052672504942, "grad/layer_20/attn": 0.003331036539748311, "grad/layer_20/mlp": 0.006284896284341812, "grad/layer_20/attn_mlp_ratio": 0.5300065961385225, "grad/layer_24/attn": 0.01188892126083374, "grad/layer_24/mlp": 0.01125758420675993, "grad/layer_24/attn_mlp_ratio": 1.0560810327393868, "grad/layer_27/attn": 0.005451834294945002, "grad/layer_27/mlp": 0.012782135978341103, "grad/layer_27/attn_mlp_ratio": 0.4265198133966787} {"step": 10050, "timestamp": 1778336579.0214427, "eos/sharpness": 54.46586608886718, "eos/L0_probe": 2.320930004119873, "eos/L_plus": 2.579698085784912, "eos/L_minus": 2.606820583343506, "eos/grad_norm": 0.18173670768737793, "eos/embed_grad_frac": 0.08798452466726303, "eos/time_s": 0.608220100402832} {"step": 10050, "timestamp": 1778336579.042922, "train/loss": 2.325448489189148, "train/z_loss": 0.0013501193141564726, "train/perplexity": 10.231267669914397, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909010.0310315024, "perf/iters_per_sec": 0.9102869181783211, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0985547304153442, "data/tokens_consumed": 21078474752, "data/tokens_consumed_B": 21.078474752, "train/loss_slope": -1.0951517810701363e-05} {"step": 10050, "timestamp": 1778336580.4101386, "geo/rankme_last": 429.25201416015625, "geo/layer_0/stable_rank_q_proj": 20.78108024597168, "geo/layer_0/stable_rank_k_proj": 17.1492862701416, "geo/layer_0/stable_rank_o_proj": 44.51205062866211, "geo/layer_0/stable_rank_gate_proj": 126.35133361816406, "geo/layer_0/stable_rank_down_proj": 57.341392517089844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06475910544395447, "geo/layer_0/attn_entropy_mean": 6.240699768066406, "geo/layer_0/attn_entropy_std": 0.44179341197013855, "geo/layer_7/stable_rank_q_proj": 42.30220031738281, "geo/layer_7/stable_rank_k_proj": 39.04362487792969, "geo/layer_7/stable_rank_o_proj": 88.61133575439453, "geo/layer_7/stable_rank_gate_proj": 78.7091064453125, "geo/layer_7/stable_rank_down_proj": 143.60035705566406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39553555846214294, "geo/layer_7/attn_entropy_mean": 4.744837760925293, "geo/layer_7/attn_entropy_std": 0.7624614834785461, "geo/layer_14/stable_rank_q_proj": 51.58884048461914, "geo/layer_14/stable_rank_k_proj": 42.56181335449219, "geo/layer_14/stable_rank_o_proj": 42.46875762939453, "geo/layer_14/stable_rank_gate_proj": 71.95317840576172, "geo/layer_14/stable_rank_down_proj": 126.77351379394531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37960007786750793, "geo/layer_14/attn_entropy_mean": 5.543410778045654, "geo/layer_14/attn_entropy_std": 0.4560297727584839, "geo/layer_21/stable_rank_q_proj": 39.00789260864258, "geo/layer_21/stable_rank_k_proj": 28.729511260986328, "geo/layer_21/stable_rank_o_proj": 65.42761993408203, "geo/layer_21/stable_rank_gate_proj": 60.92294692993164, "geo/layer_21/stable_rank_down_proj": 49.52599334716797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13501282036304474, "geo/layer_21/attn_entropy_mean": 5.854381561279297, "geo/layer_21/attn_entropy_std": 0.3248880207538605, "geo/layer_27/stable_rank_q_proj": 43.99800109863281, "geo/layer_27/stable_rank_k_proj": 30.123239517211914, "geo/layer_27/stable_rank_o_proj": 107.20514678955078, "geo/layer_27/stable_rank_gate_proj": 70.8609619140625, "geo/layer_27/stable_rank_down_proj": 129.37197875976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11087420582771301, "geo/layer_27/attn_entropy_mean": 4.30405330657959, "geo/layer_27/attn_entropy_std": 0.6946455240249634, "attnres/final_alpha/block_0": 0.25653108954429626, "attnres/block_norm/0": 1.7802282571792603, "attnres/final_alpha/block_1": 0.0038887232076376677, "attnres/block_norm/1": 50556.7890625, "attnres/final_alpha/block_2": 0.00845686998218298, "attnres/block_norm/2": 29894.205078125, "attnres/final_alpha/block_3": 0.010387898422777653, "attnres/block_norm/3": 71484.890625, "attnres/final_alpha/block_4": 0.01168947946280241, "attnres/block_norm/4": 17289.6484375, "attnres/final_alpha/block_5": 0.6091889142990112, "attnres/block_norm/5": 7185.9482421875, "attnres/final_alpha/block_6": 0.09985701739788055, "attnres/block_norm/6": 48025.7109375, "geo/tier1_time_s": 1.3629059791564941, "geo/step": 10050.0, "geo/rankme_slope": 0.00036011131014905963} {"step": 10060, "timestamp": 1778336590.766187, "train/loss": 2.308755946159363, "train/z_loss": 0.0013493653968907893, "train/perplexity": 10.06189932103965, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789510.7990235835, "perf/iters_per_sec": 0.8533052439802091, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1719135761260986, "data/tokens_consumed": 21099446272, "data/tokens_consumed_B": 21.099446272, "train/loss_slope": -1.4098386366804391e-05} {"step": 10070, "timestamp": 1778336601.1304185, "train/loss": 2.336973023414612, "train/z_loss": 0.0013528787530958652, "train/perplexity": 10.349860314271092, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024410.5105306322, "perf/iters_per_sec": 0.9653141548779641, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035932183265686, "data/tokens_consumed": 21120417792, "data/tokens_consumed_B": 21.120417792, "train/loss_slope": -1.312894155912628e-05} {"step": 10080, "timestamp": 1778336611.4944627, "train/loss": 2.3640091180801392, "train/z_loss": 0.0013538095401600004, "train/perplexity": 10.633497055574628, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024786.0616564315, "perf/iters_per_sec": 0.9654932316095503, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035740041732788, "data/tokens_consumed": 21141389312, "data/tokens_consumed_B": 21.141389312, "train/loss_slope": -1.1876103958853188e-05} {"step": 10090, "timestamp": 1778336621.837745, "train/loss": 2.313498854637146, "train/z_loss": 0.0013606982422061265, "train/perplexity": 10.109735339888388, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028533.1577737476, "perf/iters_per_sec": 0.9672799862736452, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338268280029297, "data/tokens_consumed": 21162360832, "data/tokens_consumed_B": 21.162360832, "train/loss_slope": -1.0362020856512206e-05} {"step": 10100, "timestamp": 1778336632.1830227, "grad/layer_0/attn": 0.003748904448002577, "grad/layer_0/mlp": 0.0033901939168572426, "grad/layer_0/attn_mlp_ratio": 1.1058082308451078, "grad/layer_4/attn": 0.0020913733169436455, "grad/layer_4/mlp": 0.0025483286008238792, "grad/layer_4/attn_mlp_ratio": 0.8206842846715577, "grad/layer_8/attn": 0.004576403647661209, "grad/layer_8/mlp": 0.0033518371637910604, "grad/layer_8/attn_mlp_ratio": 1.365341837176521, "grad/layer_12/attn": 0.009047009982168674, "grad/layer_12/mlp": 0.006660250015556812, "grad/layer_12/attn_mlp_ratio": 1.3583588942158404, "grad/layer_16/attn": 0.005476616322994232, "grad/layer_16/mlp": 0.004781836178153753, "grad/layer_16/attn_mlp_ratio": 1.1452956572383384, "grad/layer_20/attn": 0.0053380983881652355, "grad/layer_20/mlp": 0.006850423291325569, "grad/layer_20/attn_mlp_ratio": 0.779236272450644, "grad/layer_24/attn": 0.01665828749537468, "grad/layer_24/mlp": 0.01231750100851059, "grad/layer_24/attn_mlp_ratio": 1.3524080370380394, "grad/layer_27/attn": 0.011587526649236679, "grad/layer_27/mlp": 0.01135947648435831, "grad/layer_27/attn_mlp_ratio": 1.0200757546516173} {"step": 10100, "timestamp": 1778336632.1986594, "train/loss": 2.3167397499084474, "train/z_loss": 0.0013641064870171249, "train/perplexity": 10.14255308405666, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025021.7434773971, "perf/iters_per_sec": 0.9656056134592996, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356194972991943, "data/tokens_consumed": 21183332352, "data/tokens_consumed_B": 21.183332352, "train/loss_slope": -1.3696743753125897e-05} {"step": 10110, "timestamp": 1778336642.5511794, "train/loss": 2.2744671583175657, "train/z_loss": 0.0013503569294698536, "train/perplexity": 9.72273695255804, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027008.4315625841, "perf/iters_per_sec": 0.9665529401600762, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346044778823853, "data/tokens_consumed": 21204303872, "data/tokens_consumed_B": 21.204303872, "train/loss_slope": -1.5342902479105942e-05} {"step": 10120, "timestamp": 1778336652.8943512, "train/loss": 2.336930346488953, "train/z_loss": 0.0013741280767135323, "train/perplexity": 10.349418623476948, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028586.6304704465, "perf/iters_per_sec": 0.9673054840423806, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337995767593384, "data/tokens_consumed": 21225275392, "data/tokens_consumed_B": 21.225275392, "train/loss_slope": -1.7734305135416188e-05} {"step": 10125, "timestamp": 1778336658.6625638, "eos/sharpness": 42.199945449829094, "eos/L0_probe": 2.318448066711426, "eos/L_plus": 2.5564892292022705, "eos/L_minus": 2.502406358718872, "eos/grad_norm": 0.12670600414276123, "eos/embed_grad_frac": 0.1529340296983719, "eos/time_s": 0.5985796451568604} {"step": 10125, "timestamp": 1778336660.0453808, "geo/rankme_last": 429.245849609375, "geo/layer_0/stable_rank_q_proj": 20.74735450744629, "geo/layer_0/stable_rank_k_proj": 17.141202926635742, "geo/layer_0/stable_rank_o_proj": 44.47221374511719, "geo/layer_0/stable_rank_gate_proj": 126.28458404541016, "geo/layer_0/stable_rank_down_proj": 57.33635330200195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061395276337862015, "geo/layer_0/attn_entropy_mean": 6.239592552185059, "geo/layer_0/attn_entropy_std": 0.4385633170604706, "geo/layer_7/stable_rank_q_proj": 42.3133544921875, "geo/layer_7/stable_rank_k_proj": 39.012630462646484, "geo/layer_7/stable_rank_o_proj": 88.69173431396484, "geo/layer_7/stable_rank_gate_proj": 78.59475708007812, "geo/layer_7/stable_rank_down_proj": 143.53500366210938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40547409653663635, "geo/layer_7/attn_entropy_mean": 4.756356239318848, "geo/layer_7/attn_entropy_std": 0.7708572149276733, "geo/layer_14/stable_rank_q_proj": 51.536888122558594, "geo/layer_14/stable_rank_k_proj": 42.60538101196289, "geo/layer_14/stable_rank_o_proj": 42.41421127319336, "geo/layer_14/stable_rank_gate_proj": 72.1003189086914, "geo/layer_14/stable_rank_down_proj": 126.89258575439453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38958171010017395, "geo/layer_14/attn_entropy_mean": 5.526307106018066, "geo/layer_14/attn_entropy_std": 0.4544951915740967, "geo/layer_21/stable_rank_q_proj": 38.934024810791016, "geo/layer_21/stable_rank_k_proj": 28.674901962280273, "geo/layer_21/stable_rank_o_proj": 65.41827392578125, "geo/layer_21/stable_rank_gate_proj": 60.92612838745117, "geo/layer_21/stable_rank_down_proj": 49.44347381591797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13981257379055023, "geo/layer_21/attn_entropy_mean": 5.846428871154785, "geo/layer_21/attn_entropy_std": 0.33109331130981445, "geo/layer_27/stable_rank_q_proj": 44.07082748413086, "geo/layer_27/stable_rank_k_proj": 30.16239356994629, "geo/layer_27/stable_rank_o_proj": 107.053466796875, "geo/layer_27/stable_rank_gate_proj": 70.86161804199219, "geo/layer_27/stable_rank_down_proj": 129.51589965820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09993034601211548, "geo/layer_27/attn_entropy_mean": 4.3245673179626465, "geo/layer_27/attn_entropy_std": 0.6810254454612732, "attnres/final_alpha/block_0": 0.2570979595184326, "attnres/block_norm/0": 1.7800973653793335, "attnres/final_alpha/block_1": 0.0038864403031766415, "attnres/block_norm/1": 50462.83984375, "attnres/final_alpha/block_2": 0.008480226621031761, "attnres/block_norm/2": 29995.9765625, "attnres/final_alpha/block_3": 0.01046108826994896, "attnres/block_norm/3": 72018.03125, "attnres/final_alpha/block_4": 0.011885233223438263, "attnres/block_norm/4": 17255.97265625, "attnres/final_alpha/block_5": 0.6098126173019409, "attnres/block_norm/5": 7130.5712890625, "attnres/final_alpha/block_6": 0.09837646782398224, "attnres/block_norm/6": 47855.2421875, "geo/tier1_time_s": 1.3612034320831299, "geo/step": 10125.0, "geo/rankme_slope": 0.0003583241695115546} {"step": 10130, "timestamp": 1778336665.2347612, "train/loss": 2.3584014177322388, "train/z_loss": 0.0013652365654706956, "train/perplexity": 10.574034470439482, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700132.0781857655, "perf/iters_per_sec": 0.8106861487320736, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2335229873657227, "data/tokens_consumed": 21246246912, "data/tokens_consumed_B": 21.246246912, "train/loss_slope": -1.5425032536403356e-05} {"step": 10140, "timestamp": 1778336675.5946457, "train/loss": 2.3310602426528932, "train/z_loss": 0.0013446837081573903, "train/perplexity": 10.288844423874332, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025553.907421191, "perf/iters_per_sec": 0.9658593690019565, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353474140167236, "data/tokens_consumed": 21267218432, "data/tokens_consumed_B": 21.267218432, "train/loss_slope": -1.3544938845424139e-05} {"step": 10150, "timestamp": 1778336685.9347887, "grad/layer_0/attn": 0.003280612640082836, "grad/layer_0/mlp": 0.0033159537706524134, "grad/layer_0/attn_mlp_ratio": 0.989342062058715, "grad/layer_4/attn": 0.002018394647166133, "grad/layer_4/mlp": 0.002743761520832777, "grad/layer_4/attn_mlp_ratio": 0.7356304687116065, "grad/layer_8/attn": 0.004316157195717096, "grad/layer_8/mlp": 0.00367994187399745, "grad/layer_8/attn_mlp_ratio": 1.1728872971952158, "grad/layer_12/attn": 0.006468860432505608, "grad/layer_12/mlp": 0.006952977739274502, "grad/layer_12/attn_mlp_ratio": 0.9303726521269327, "grad/layer_16/attn": 0.004194742068648338, "grad/layer_16/mlp": 0.004477700684219599, "grad/layer_16/attn_mlp_ratio": 0.9368071407163987, "grad/layer_20/attn": 0.006351484917104244, "grad/layer_20/mlp": 0.006260076072067022, "grad/layer_20/attn_mlp_ratio": 1.014601858272124, "grad/layer_24/attn": 0.015915872529149055, "grad/layer_24/mlp": 0.013479900546371937, "grad/layer_24/attn_mlp_ratio": 1.180711412248632, "grad/layer_27/attn": 0.005847861059010029, "grad/layer_27/mlp": 0.014608090743422508, "grad/layer_27/attn_mlp_ratio": 0.4003165863144333} {"step": 10150, "timestamp": 1778336685.9505715, "train/loss": 2.321121668815613, "train/z_loss": 0.0013577254372648894, "train/perplexity": 10.187094446230487, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026030.1168651842, "perf/iters_per_sec": 0.9660864433599397, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351040601730346, "data/tokens_consumed": 21288189952, "data/tokens_consumed_B": 21.288189952, "train/loss_slope": -1.557215619938456e-05} {"step": 10160, "timestamp": 1778336696.303147, "train/loss": 2.311156964302063, "train/z_loss": 0.0013505613896995783, "train/perplexity": 10.086087149947499, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026717.835960124, "perf/iters_per_sec": 0.9664143733788128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347528219223023, "data/tokens_consumed": 21309161472, "data/tokens_consumed_B": 21.309161472, "train/loss_slope": -1.6812339445652607e-05} {"step": 10170, "timestamp": 1778336706.6632261, "train/loss": 2.3526469707489013, "train/z_loss": 0.0013439822127111257, "train/perplexity": 10.51336148683837, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025485.4826677868, "perf/iters_per_sec": 0.9658267415369924, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353823900222778, "data/tokens_consumed": 21330132992, "data/tokens_consumed_B": 21.330132992, "train/loss_slope": -1.5128526564585756e-05} {"step": 10180, "timestamp": 1778336717.0287204, "train/loss": 2.330263924598694, "train/z_loss": 0.0013594251358881593, "train/perplexity": 10.280654492630594, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024082.0932432478, "perf/iters_per_sec": 0.965157553311943, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361002683639526, "data/tokens_consumed": 21351104512, "data/tokens_consumed_B": 21.351104512, "train/loss_slope": -1.6475175430636595e-05} {"step": 10190, "timestamp": 1778336727.3992913, "train/loss": 2.3302887678146362, "train/z_loss": 0.0013617540011182428, "train/perplexity": 10.280909900322744, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023213.6288374725, "perf/iters_per_sec": 0.9647434372126925, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365450143814088, "data/tokens_consumed": 21372076032, "data/tokens_consumed_B": 21.372076032, "train/loss_slope": -1.696138705285932e-05} {"step": 10200, "timestamp": 1778336737.7746627, "grad/layer_0/attn": 0.00245865760371089, "grad/layer_0/mlp": 0.0029967220034450293, "grad/layer_0/attn_mlp_ratio": 0.8204489835358504, "grad/layer_4/attn": 0.0019047021633014083, "grad/layer_4/mlp": 0.0026421211659908295, "grad/layer_4/attn_mlp_ratio": 0.7208988428421436, "grad/layer_8/attn": 0.005177045240998268, "grad/layer_8/mlp": 0.003452382981777191, "grad/layer_8/attn_mlp_ratio": 1.4995570069626423, "grad/layer_12/attn": 0.00667161587625742, "grad/layer_12/mlp": 0.006442762911319733, "grad/layer_12/attn_mlp_ratio": 1.035520919291242, "grad/layer_16/attn": 0.0033918828703463078, "grad/layer_16/mlp": 0.004350233357399702, "grad/layer_16/attn_mlp_ratio": 0.7797013432869333, "grad/layer_20/attn": 0.004829107318073511, "grad/layer_20/mlp": 0.005637797527015209, "grad/layer_20/attn_mlp_ratio": 0.8565591810059634, "grad/layer_24/attn": 0.0072137052193284035, "grad/layer_24/mlp": 0.008132043294608593, "grad/layer_24/attn_mlp_ratio": 0.8870716582883668, "grad/layer_27/attn": 0.004030316136777401, "grad/layer_27/mlp": 0.007852663286030293, "grad/layer_27/attn_mlp_ratio": 0.5132419331697371} {"step": 10200, "timestamp": 1778336738.375807, "eos/sharpness": 26.3416051864624, "eos/L0_probe": 2.31953501701355, "eos/L_plus": 2.4315273761749268, "eos/L_minus": 2.470958709716797, "eos/grad_norm": 0.10605625808238983, "eos/embed_grad_frac": 0.2587888240814209, "eos/time_s": 0.5979890823364258} {"step": 10200, "timestamp": 1778336738.3945093, "train/loss": 2.291187047958374, "train/z_loss": 0.0013650357723236084, "train/perplexity": 9.886666665843084, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908950.0820926663, "perf/iters_per_sec": 0.9102583322966892, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0985892295837403, "data/tokens_consumed": 21393047552, "data/tokens_consumed_B": 21.393047552, "train/loss_slope": -1.765930827873587e-05} {"step": 10200, "timestamp": 1778336739.764142, "geo/rankme_last": 430.32672119140625, "geo/layer_0/stable_rank_q_proj": 20.728923797607422, "geo/layer_0/stable_rank_k_proj": 17.166746139526367, "geo/layer_0/stable_rank_o_proj": 44.46181869506836, "geo/layer_0/stable_rank_gate_proj": 126.74099731445312, "geo/layer_0/stable_rank_down_proj": 57.39302062988281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06292145699262619, "geo/layer_0/attn_entropy_mean": 6.241267681121826, "geo/layer_0/attn_entropy_std": 0.44271400570869446, "geo/layer_7/stable_rank_q_proj": 42.243953704833984, "geo/layer_7/stable_rank_k_proj": 38.996917724609375, "geo/layer_7/stable_rank_o_proj": 88.72762298583984, "geo/layer_7/stable_rank_gate_proj": 78.53978729248047, "geo/layer_7/stable_rank_down_proj": 143.41705322265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39536362886428833, "geo/layer_7/attn_entropy_mean": 4.727336406707764, "geo/layer_7/attn_entropy_std": 0.7679290175437927, "geo/layer_14/stable_rank_q_proj": 51.413795471191406, "geo/layer_14/stable_rank_k_proj": 42.63608932495117, "geo/layer_14/stable_rank_o_proj": 42.4202995300293, "geo/layer_14/stable_rank_gate_proj": 72.13722229003906, "geo/layer_14/stable_rank_down_proj": 126.88523864746094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38224831223487854, "geo/layer_14/attn_entropy_mean": 5.509623050689697, "geo/layer_14/attn_entropy_std": 0.463627427816391, "geo/layer_21/stable_rank_q_proj": 38.96645736694336, "geo/layer_21/stable_rank_k_proj": 28.62882423400879, "geo/layer_21/stable_rank_o_proj": 65.3650131225586, "geo/layer_21/stable_rank_gate_proj": 60.92105484008789, "geo/layer_21/stable_rank_down_proj": 49.40793228149414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13752134144306183, "geo/layer_21/attn_entropy_mean": 5.847715854644775, "geo/layer_21/attn_entropy_std": 0.33219125866889954, "geo/layer_27/stable_rank_q_proj": 44.03681945800781, "geo/layer_27/stable_rank_k_proj": 30.254959106445312, "geo/layer_27/stable_rank_o_proj": 106.97802734375, "geo/layer_27/stable_rank_gate_proj": 70.81856536865234, "geo/layer_27/stable_rank_down_proj": 129.5580291748047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10202012956142426, "geo/layer_27/attn_entropy_mean": 4.287278175354004, "geo/layer_27/attn_entropy_std": 0.6917585730552673, "attnres/final_alpha/block_0": 0.25779989361763, "attnres/block_norm/0": 1.780247449874878, "attnres/final_alpha/block_1": 0.003956880886107683, "attnres/block_norm/1": 50518.71875, "attnres/final_alpha/block_2": 0.008635776117444038, "attnres/block_norm/2": 29866.419921875, "attnres/final_alpha/block_3": 0.010590963065624237, "attnres/block_norm/3": 71708.6875, "attnres/final_alpha/block_4": 0.011761799454689026, "attnres/block_norm/4": 17306.12109375, "attnres/final_alpha/block_5": 0.6072868704795837, "attnres/block_norm/5": 7182.9521484375, "attnres/final_alpha/block_6": 0.099967822432518, "attnres/block_norm/6": 47917.87890625, "geo/tier1_time_s": 1.365433931350708, "geo/step": 10200.0, "geo/rankme_slope": 0.000368027875212585} {"step": 10210, "timestamp": 1778336750.1297307, "train/loss": 2.3433003187179566, "train/z_loss": 0.0013594361953437328, "train/perplexity": 10.415554551075425, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787686.6200221425, "perf/iters_per_sec": 0.8524354076491082, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1731094121932983, "data/tokens_consumed": 21414019072, "data/tokens_consumed_B": 21.414019072, "train/loss_slope": -1.7514495938309932e-05} {"step": 10220, "timestamp": 1778336760.5195875, "train/loss": 2.3525604009628296, "train/z_loss": 0.0013549478841014206, "train/perplexity": 10.512451386777713, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020553.382326717, "perf/iters_per_sec": 0.9634749328263841, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379097223281861, "data/tokens_consumed": 21434990592, "data/tokens_consumed_B": 21.434990592, "train/loss_slope": -1.6541666030311502e-05} {"step": 10230, "timestamp": 1778336770.8790429, "train/loss": 2.3726046323776244, "train/z_loss": 0.0013554187957197429, "train/perplexity": 10.725291376076978, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025711.9037038884, "perf/iters_per_sec": 0.9659347075004046, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352666616439818, "data/tokens_consumed": 21455962112, "data/tokens_consumed_B": 21.455962112, "train/loss_slope": -1.4654187271983418e-05} {"step": 10240, "timestamp": 1778336781.2427015, "train/loss": 2.377668786048889, "train/z_loss": 0.0013581194798462093, "train/perplexity": 10.779743660771262, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024548.710563694, "perf/iters_per_sec": 0.9653800537889928, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358614683151246, "data/tokens_consumed": 21476933632, "data/tokens_consumed_B": 21.476933632, "train/loss_slope": -9.969687039809954e-06} {"step": 10250, "timestamp": 1778336791.592169, "grad/layer_0/attn": 0.0027098176069557667, "grad/layer_0/mlp": 0.0029884667601436377, "grad/layer_0/attn_mlp_ratio": 0.9067584596957926, "grad/layer_4/attn": 0.0022828150540590286, "grad/layer_4/mlp": 0.0024566808715462685, "grad/layer_4/attn_mlp_ratio": 0.9292273113599254, "grad/layer_8/attn": 0.003279809607192874, "grad/layer_8/mlp": 0.0032656744588166475, "grad/layer_8/attn_mlp_ratio": 1.0043283701794672, "grad/layer_12/attn": 0.009418154135346413, "grad/layer_12/mlp": 0.007293602451682091, "grad/layer_12/attn_mlp_ratio": 1.291289739002072, "grad/layer_16/attn": 0.0031287765596061945, "grad/layer_16/mlp": 0.004043243825435638, "grad/layer_16/attn_mlp_ratio": 0.7738282966118809, "grad/layer_20/attn": 0.003890588879585266, "grad/layer_20/mlp": 0.005524453707039356, "grad/layer_20/attn_mlp_ratio": 0.7042486036588467, "grad/layer_24/attn": 0.004941768478602171, "grad/layer_24/mlp": 0.007570079993456602, "grad/layer_24/attn_mlp_ratio": 0.652802667553507, "grad/layer_27/attn": 0.004621957894414663, "grad/layer_27/mlp": 0.006421813275665045, "grad/layer_27/attn_mlp_ratio": 0.7197278438406834} {"step": 10250, "timestamp": 1778336791.6082447, "train/loss": 2.33016300201416, "train/z_loss": 0.0013493440579622983, "train/perplexity": 10.279616994762861, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024624.2952318462, "perf/iters_per_sec": 0.9654160953673583, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358227968215943, "data/tokens_consumed": 21497905152, "data/tokens_consumed_B": 21.497905152, "train/loss_slope": -9.62865174514126e-06} {"step": 10260, "timestamp": 1778336801.9637043, "train/loss": 2.3202460050582885, "train/z_loss": 0.001360472443047911, "train/perplexity": 10.178177881357591, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026501.5546618246, "perf/iters_per_sec": 0.9663112424191592, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034863257408142, "data/tokens_consumed": 21518876672, "data/tokens_consumed_B": 21.518876672, "train/loss_slope": -9.41614774194095e-06} {"step": 10270, "timestamp": 1778336812.3213053, "train/loss": 2.374223065376282, "train/z_loss": 0.001349254639353603, "train/perplexity": 10.742663595654992, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025740.641431232, "perf/iters_per_sec": 0.9659484107166443, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352519750595093, "data/tokens_consumed": 21539848192, "data/tokens_consumed_B": 21.539848192, "train/loss_slope": -5.865617088823762e-06} {"step": 10275, "timestamp": 1778336818.1038833, "eos/sharpness": 49.38180446624755, "eos/L0_probe": 2.32043719291687, "eos/L_plus": 2.5619828701019287, "eos/L_minus": 2.572709560394287, "eos/grad_norm": 0.19317185878753662, "eos/embed_grad_frac": 0.10895530134439468, "eos/time_s": 0.6093270778656006} {"step": 10275, "timestamp": 1778336819.484617, "geo/rankme_last": 429.3387756347656, "geo/layer_0/stable_rank_q_proj": 20.70343589782715, "geo/layer_0/stable_rank_k_proj": 17.14366340637207, "geo/layer_0/stable_rank_o_proj": 44.3946647644043, "geo/layer_0/stable_rank_gate_proj": 126.53700256347656, "geo/layer_0/stable_rank_down_proj": 57.372169494628906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06325278431177139, "geo/layer_0/attn_entropy_mean": 6.237880706787109, "geo/layer_0/attn_entropy_std": 0.4495631158351898, "geo/layer_7/stable_rank_q_proj": 42.18743896484375, "geo/layer_7/stable_rank_k_proj": 39.096923828125, "geo/layer_7/stable_rank_o_proj": 88.65801239013672, "geo/layer_7/stable_rank_gate_proj": 78.56031036376953, "geo/layer_7/stable_rank_down_proj": 143.63900756835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.41016077995300293, "geo/layer_7/attn_entropy_mean": 4.749728202819824, "geo/layer_7/attn_entropy_std": 0.7721642851829529, "geo/layer_14/stable_rank_q_proj": 51.390098571777344, "geo/layer_14/stable_rank_k_proj": 42.642433166503906, "geo/layer_14/stable_rank_o_proj": 42.427711486816406, "geo/layer_14/stable_rank_gate_proj": 72.09930419921875, "geo/layer_14/stable_rank_down_proj": 126.70237731933594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3715401291847229, "geo/layer_14/attn_entropy_mean": 5.511163711547852, "geo/layer_14/attn_entropy_std": 0.4536212682723999, "geo/layer_21/stable_rank_q_proj": 38.95359420776367, "geo/layer_21/stable_rank_k_proj": 28.58966064453125, "geo/layer_21/stable_rank_o_proj": 65.44013977050781, "geo/layer_21/stable_rank_gate_proj": 60.8602294921875, "geo/layer_21/stable_rank_down_proj": 49.427581787109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13472576439380646, "geo/layer_21/attn_entropy_mean": 5.851460933685303, "geo/layer_21/attn_entropy_std": 0.3252579867839813, "geo/layer_27/stable_rank_q_proj": 44.08253860473633, "geo/layer_27/stable_rank_k_proj": 30.320621490478516, "geo/layer_27/stable_rank_o_proj": 107.03268432617188, "geo/layer_27/stable_rank_gate_proj": 70.7364501953125, "geo/layer_27/stable_rank_down_proj": 129.47288513183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1096813753247261, "geo/layer_27/attn_entropy_mean": 4.312216281890869, "geo/layer_27/attn_entropy_std": 0.6868957281112671, "attnres/final_alpha/block_0": 0.2589517831802368, "attnres/block_norm/0": 1.7804102897644043, "attnres/final_alpha/block_1": 0.003901056479662657, "attnres/block_norm/1": 50469.63671875, "attnres/final_alpha/block_2": 0.008660290390253067, "attnres/block_norm/2": 29862.94140625, "attnres/final_alpha/block_3": 0.01054843794554472, "attnres/block_norm/3": 71612.25, "attnres/final_alpha/block_4": 0.012075806967914104, "attnres/block_norm/4": 17299.45703125, "attnres/final_alpha/block_5": 0.6052237749099731, "attnres/block_norm/5": 7158.22216796875, "attnres/final_alpha/block_6": 0.10063887387514114, "attnres/block_norm/6": 47852.1875, "geo/tier1_time_s": 1.3596539497375488, "geo/step": 10275.0, "geo/rankme_slope": 0.00038275142088085233} {"step": 10280, "timestamp": 1778336824.665756, "train/loss": 2.326624369621277, "train/z_loss": 0.001366809045430273, "train/perplexity": 10.243305493496868, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699754.1995539023, "perf/iters_per_sec": 0.8105059621591102, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2337972164154052, "data/tokens_consumed": 21560819712, "data/tokens_consumed_B": 21.560819712, "train/loss_slope": -1.0021895958860122e-05} {"step": 10290, "timestamp": 1778336835.0224724, "train/loss": 2.336689066886902, "train/z_loss": 0.001357099285814911, "train/perplexity": 10.346921821095874, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025677.5221371325, "perf/iters_per_sec": 0.9659183130918181, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352842330932617, "data/tokens_consumed": 21581791232, "data/tokens_consumed_B": 21.581791232, "train/loss_slope": -6.7905380959772755e-06} {"step": 10300, "timestamp": 1778336845.3774836, "grad/layer_0/attn": 0.003491603070870042, "grad/layer_0/mlp": 0.0035713897086679935, "grad/layer_0/attn_mlp_ratio": 0.9776594709420113, "grad/layer_4/attn": 0.0019915096927434206, "grad/layer_4/mlp": 0.0026210893411189318, "grad/layer_4/attn_mlp_ratio": 0.7598022644710871, "grad/layer_8/attn": 0.003699640044942498, "grad/layer_8/mlp": 0.003642637748271227, "grad/layer_8/attn_mlp_ratio": 1.0156486038540242, "grad/layer_12/attn": 0.006531706545501947, "grad/layer_12/mlp": 0.007870920933783054, "grad/layer_12/attn_mlp_ratio": 0.8298528872881563, "grad/layer_16/attn": 0.0042627546936273575, "grad/layer_16/mlp": 0.004794897511601448, "grad/layer_16/attn_mlp_ratio": 0.8890189194683639, "grad/layer_20/attn": 0.003025464015081525, "grad/layer_20/mlp": 0.006165775004774332, "grad/layer_20/attn_mlp_ratio": 0.4906867285410432, "grad/layer_24/attn": 0.010817037895321846, "grad/layer_24/mlp": 0.010884759947657585, "grad/layer_24/attn_mlp_ratio": 0.99377825950786, "grad/layer_27/attn": 0.005297907628118992, "grad/layer_27/mlp": 0.011160851456224918, "grad/layer_27/attn_mlp_ratio": 0.4746866851001263} {"step": 10300, "timestamp": 1778336845.3939624, "train/loss": 2.3578285217285155, "train/z_loss": 0.001369555643759668, "train/perplexity": 10.567978383267844, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023004.7480790052, "perf/iters_per_sec": 0.9646438351054216, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366520404815673, "data/tokens_consumed": 21602762752, "data/tokens_consumed_B": 21.602762752, "train/loss_slope": -7.824443275779028e-06} {"step": 10310, "timestamp": 1778336855.7597916, "train/loss": 2.3630450963974, "train/z_loss": 0.0013671266962774098, "train/perplexity": 10.623251073317736, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025057.7345332096, "perf/iters_per_sec": 0.965622775332074, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356010913848877, "data/tokens_consumed": 21623734272, "data/tokens_consumed_B": 21.623734272, "train/loss_slope": -7.370985630858347e-06} {"step": 10320, "timestamp": 1778336866.1174078, "train/loss": 2.3663817167282106, "train/z_loss": 0.0013514308142475785, "train/perplexity": 10.65875602916722, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025738.728667557, "perf/iters_per_sec": 0.9659474986398492, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352529525756835, "data/tokens_consumed": 21644705792, "data/tokens_consumed_B": 21.644705792, "train/loss_slope": -7.177718179990635e-06} {"step": 10330, "timestamp": 1778336877.13149, "train/loss": 2.3835704565048217, "train/z_loss": 0.0013471714220941067, "train/perplexity": 10.84355025299968, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904969.6208004889, "perf/iters_per_sec": 0.9083603004457897, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.100884747505188, "data/tokens_consumed": 21665677312, "data/tokens_consumed_B": 21.665677312, "train/loss_slope": -5.668870243195006e-06} {"step": 10340, "timestamp": 1778336887.4905891, "train/loss": 2.391826128959656, "train/z_loss": 0.001347498467657715, "train/perplexity": 10.93344159833453, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025781.7433965993, "perf/iters_per_sec": 0.9659680096610066, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352309703826905, "data/tokens_consumed": 21686648832, "data/tokens_consumed_B": 21.686648832, "train/loss_slope": 3.748305309580322e-07} {"step": 10350, "timestamp": 1778336897.82925, "grad/layer_0/attn": 0.002933684503659606, "grad/layer_0/mlp": 0.003085390431806445, "grad/layer_0/attn_mlp_ratio": 0.9508308505575086, "grad/layer_4/attn": 0.0019499191548675299, "grad/layer_4/mlp": 0.0026816665194928646, "grad/layer_4/attn_mlp_ratio": 0.7271295919834669, "grad/layer_8/attn": 0.003745579393580556, "grad/layer_8/mlp": 0.0034901590552181005, "grad/layer_8/attn_mlp_ratio": 1.0731829773380333, "grad/layer_12/attn": 0.006866522133350372, "grad/layer_12/mlp": 0.006897099781781435, "grad/layer_12/attn_mlp_ratio": 0.9955665788584804, "grad/layer_16/attn": 0.005747713148593903, "grad/layer_16/mlp": 0.004980027675628662, "grad/layer_16/attn_mlp_ratio": 1.15415282957299, "grad/layer_20/attn": 0.0036561067681759596, "grad/layer_20/mlp": 0.007164520211517811, "grad/layer_20/attn_mlp_ratio": 0.5103072654143135, "grad/layer_24/attn": 0.016213247552514076, "grad/layer_24/mlp": 0.014560351148247719, "grad/layer_24/attn_mlp_ratio": 1.1135203592334544, "grad/layer_27/attn": 0.006509743630886078, "grad/layer_27/mlp": 0.015282204374670982, "grad/layer_27/attn_mlp_ratio": 0.4259688869937221} {"step": 10350, "timestamp": 1778336898.4334497, "eos/sharpness": 62.85667419433592, "eos/L0_probe": 2.319045305252075, "eos/L_plus": 2.618335008621216, "eos/L_minus": 2.648322343826294, "eos/grad_norm": 0.25392916798591614, "eos/embed_grad_frac": 0.03735089302062988, "eos/time_s": 0.6011536121368408} {"step": 10350, "timestamp": 1778336898.464072, "train/loss": 2.2911877155303957, "train/z_loss": 0.0013655902002938092, "train/perplexity": 9.88667326590734, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912173.891160761, "perf/iters_per_sec": 0.9117955642513089, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967370748519898, "data/tokens_consumed": 21707620352, "data/tokens_consumed_B": 21.707620352, "train/loss_slope": -3.0470288125309386e-06} {"step": 10350, "timestamp": 1778336899.825984, "geo/rankme_last": 429.08953857421875, "geo/layer_0/stable_rank_q_proj": 20.706857681274414, "geo/layer_0/stable_rank_k_proj": 17.18527603149414, "geo/layer_0/stable_rank_o_proj": 44.4299430847168, "geo/layer_0/stable_rank_gate_proj": 126.55783081054688, "geo/layer_0/stable_rank_down_proj": 57.364967346191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06459347903728485, "geo/layer_0/attn_entropy_mean": 6.2415242195129395, "geo/layer_0/attn_entropy_std": 0.44696852564811707, "geo/layer_7/stable_rank_q_proj": 42.15258026123047, "geo/layer_7/stable_rank_k_proj": 39.06267547607422, "geo/layer_7/stable_rank_o_proj": 88.70752716064453, "geo/layer_7/stable_rank_gate_proj": 78.6074447631836, "geo/layer_7/stable_rank_down_proj": 143.8963165283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3980311453342438, "geo/layer_7/attn_entropy_mean": 4.737554550170898, "geo/layer_7/attn_entropy_std": 0.7737945318222046, "geo/layer_14/stable_rank_q_proj": 51.36808395385742, "geo/layer_14/stable_rank_k_proj": 42.68544387817383, "geo/layer_14/stable_rank_o_proj": 42.4708137512207, "geo/layer_14/stable_rank_gate_proj": 72.07678985595703, "geo/layer_14/stable_rank_down_proj": 126.8327865600586, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37371328473091125, "geo/layer_14/attn_entropy_mean": 5.510298728942871, "geo/layer_14/attn_entropy_std": 0.44240522384643555, "geo/layer_21/stable_rank_q_proj": 38.9473762512207, "geo/layer_21/stable_rank_k_proj": 28.630481719970703, "geo/layer_21/stable_rank_o_proj": 65.5375747680664, "geo/layer_21/stable_rank_gate_proj": 60.812339782714844, "geo/layer_21/stable_rank_down_proj": 49.356964111328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14039181172847748, "geo/layer_21/attn_entropy_mean": 5.855006217956543, "geo/layer_21/attn_entropy_std": 0.3171772360801697, "geo/layer_27/stable_rank_q_proj": 44.05609893798828, "geo/layer_27/stable_rank_k_proj": 30.303876876831055, "geo/layer_27/stable_rank_o_proj": 107.09444427490234, "geo/layer_27/stable_rank_gate_proj": 70.82564544677734, "geo/layer_27/stable_rank_down_proj": 129.59193420410156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10026820749044418, "geo/layer_27/attn_entropy_mean": 4.293970108032227, "geo/layer_27/attn_entropy_std": 0.6792951226234436, "attnres/final_alpha/block_0": 0.258000910282135, "attnres/block_norm/0": 1.7807228565216064, "attnres/final_alpha/block_1": 0.0039477902464568615, "attnres/block_norm/1": 50510.0390625, "attnres/final_alpha/block_2": 0.008439924567937851, "attnres/block_norm/2": 29820.6015625, "attnres/final_alpha/block_3": 0.010324889793992043, "attnres/block_norm/3": 71911.484375, "attnres/final_alpha/block_4": 0.012096894904971123, "attnres/block_norm/4": 17338.66796875, "attnres/final_alpha/block_5": 0.6052042245864868, "attnres/block_norm/5": 7191.52197265625, "attnres/final_alpha/block_6": 0.1019853800535202, "attnres/block_norm/6": 47723.5546875, "geo/tier1_time_s": 1.358804702758789, "geo/step": 10350.0, "geo/rankme_slope": 0.0003727571692739596} {"step": 10360, "timestamp": 1778336910.211877, "train/loss": 2.35941686630249, "train/z_loss": 0.0013563752872869372, "train/perplexity": 10.584777312102773, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785574.10679569, "perf/iters_per_sec": 0.8514280828455401, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1744973182678222, "data/tokens_consumed": 21728591872, "data/tokens_consumed_B": 21.728591872, "train/loss_slope": -8.250812635814e-07} {"step": 10370, "timestamp": 1778336920.570787, "train/loss": 2.3448079586029054, "train/z_loss": 0.0013535017729736864, "train/perplexity": 10.431269299654751, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025574.0579229759, "perf/iters_per_sec": 0.9658689775099639, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353371143341064, "data/tokens_consumed": 21749563392, "data/tokens_consumed_B": 21.749563392, "train/loss_slope": 4.1975837693783285e-07} {"step": 10380, "timestamp": 1778336930.923524, "train/loss": 2.3255016088485716, "train/z_loss": 0.0013657609117217362, "train/perplexity": 10.231811165803524, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026972.9783596422, "perf/iters_per_sec": 0.9665360347555362, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034622573852539, "data/tokens_consumed": 21770534912, "data/tokens_consumed_B": 21.770534912, "train/loss_slope": -1.3097147307809644e-06} {"step": 10390, "timestamp": 1778336941.2731175, "train/loss": 2.350985622406006, "train/z_loss": 0.0013563420390710235, "train/perplexity": 10.495909631973072, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027641.9403406093, "perf/iters_per_sec": 0.9668550206854865, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342812299728394, "data/tokens_consumed": 21791506432, "data/tokens_consumed_B": 21.791506432, "train/loss_slope": 2.5052505250524927e-06} {"step": 10400, "timestamp": 1778336951.6201289, "grad/layer_0/attn": 0.003254162147641182, "grad/layer_0/mlp": 0.0035391112323850393, "grad/layer_0/attn_mlp_ratio": 0.9194856680160363, "grad/layer_4/attn": 0.0018927882192656398, "grad/layer_4/mlp": 0.0027052804362028837, "grad/layer_4/attn_mlp_ratio": 0.6996643024395343, "grad/layer_8/attn": 0.004904433619230986, "grad/layer_8/mlp": 0.003570281434804201, "grad/layer_8/attn_mlp_ratio": 1.3736825993751642, "grad/layer_12/attn": 0.005652087274938822, "grad/layer_12/mlp": 0.007276579737663269, "grad/layer_12/attn_mlp_ratio": 0.7767505340467589, "grad/layer_16/attn": 0.004571347497403622, "grad/layer_16/mlp": 0.004736979492008686, "grad/layer_16/attn_mlp_ratio": 0.9650342393527541, "grad/layer_20/attn": 0.002823031973093748, "grad/layer_20/mlp": 0.006777494214475155, "grad/layer_20/attn_mlp_ratio": 0.4165303343839636, "grad/layer_24/attn": 0.0071325902827084064, "grad/layer_24/mlp": 0.009289340116083622, "grad/layer_24/attn_mlp_ratio": 0.7678252832595145, "grad/layer_27/attn": 0.007685443386435509, "grad/layer_27/mlp": 0.009216724894940853, "grad/layer_27/attn_mlp_ratio": 0.8338583814374543} {"step": 10400, "timestamp": 1778336951.6358016, "train/loss": 2.3581863403320313, "train/z_loss": 0.0013648850959725677, "train/perplexity": 10.571760479146711, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024885.0171000375, "perf/iters_per_sec": 0.9655404172420681, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356894254684448, "data/tokens_consumed": 21812477952, "data/tokens_consumed_B": 21.812477952, "train/loss_slope": 2.8959434811431525e-06} {"step": 10410, "timestamp": 1778336961.9831305, "train/loss": 2.3032241582870485, "train/z_loss": 0.0013518003863282502, "train/perplexity": 10.006392695387337, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027722.009861596, "perf/iters_per_sec": 0.9668932008083324, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342403888702392, "data/tokens_consumed": 21833449472, "data/tokens_consumed_B": 21.833449472, "train/loss_slope": -7.845358987344702e-08} {"step": 10420, "timestamp": 1778336972.3432105, "train/loss": 2.3734500885009764, "train/z_loss": 0.001355572883039713, "train/perplexity": 10.734362973624101, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025842.4896502562, "perf/iters_per_sec": 0.9659969757319719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351999282836915, "data/tokens_consumed": 21854420992, "data/tokens_consumed_B": 21.854420992, "train/loss_slope": 1.7498327238653337e-06} {"step": 10425, "timestamp": 1778336978.113451, "eos/sharpness": 56.7298650741577, "eos/L0_probe": 2.3230092525482178, "eos/L_plus": 2.596158981323242, "eos/L_minus": 2.6171581745147705, "eos/grad_norm": 0.20125344395637512, "eos/embed_grad_frac": 0.05963932350277901, "eos/time_s": 0.6021709442138672} {"step": 10425, "timestamp": 1778336979.4913712, "geo/rankme_last": 429.6427001953125, "geo/layer_0/stable_rank_q_proj": 20.692825317382812, "geo/layer_0/stable_rank_k_proj": 17.191448211669922, "geo/layer_0/stable_rank_o_proj": 44.388404846191406, "geo/layer_0/stable_rank_gate_proj": 126.45579528808594, "geo/layer_0/stable_rank_down_proj": 57.44874954223633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0635298416018486, "geo/layer_0/attn_entropy_mean": 6.242845058441162, "geo/layer_0/attn_entropy_std": 0.4478680491447449, "geo/layer_7/stable_rank_q_proj": 42.16983413696289, "geo/layer_7/stable_rank_k_proj": 39.00955581665039, "geo/layer_7/stable_rank_o_proj": 88.73265075683594, "geo/layer_7/stable_rank_gate_proj": 78.60784912109375, "geo/layer_7/stable_rank_down_proj": 143.93704223632812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3949149549007416, "geo/layer_7/attn_entropy_mean": 4.702439308166504, "geo/layer_7/attn_entropy_std": 0.7624050974845886, "geo/layer_14/stable_rank_q_proj": 51.39008331298828, "geo/layer_14/stable_rank_k_proj": 42.81180191040039, "geo/layer_14/stable_rank_o_proj": 42.54463195800781, "geo/layer_14/stable_rank_gate_proj": 71.90557861328125, "geo/layer_14/stable_rank_down_proj": 126.6094970703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3792869746685028, "geo/layer_14/attn_entropy_mean": 5.518960952758789, "geo/layer_14/attn_entropy_std": 0.4683716893196106, "geo/layer_21/stable_rank_q_proj": 39.03062057495117, "geo/layer_21/stable_rank_k_proj": 28.581274032592773, "geo/layer_21/stable_rank_o_proj": 65.58780670166016, "geo/layer_21/stable_rank_gate_proj": 60.793941497802734, "geo/layer_21/stable_rank_down_proj": 49.298091888427734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13834956288337708, "geo/layer_21/attn_entropy_mean": 5.841411590576172, "geo/layer_21/attn_entropy_std": 0.329878568649292, "geo/layer_27/stable_rank_q_proj": 44.086524963378906, "geo/layer_27/stable_rank_k_proj": 30.330949783325195, "geo/layer_27/stable_rank_o_proj": 107.2671890258789, "geo/layer_27/stable_rank_gate_proj": 70.79090118408203, "geo/layer_27/stable_rank_down_proj": 129.69805908203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09914758056402206, "geo/layer_27/attn_entropy_mean": 4.322208404541016, "geo/layer_27/attn_entropy_std": 0.6755841970443726, "attnres/final_alpha/block_0": 0.2599230110645294, "attnres/block_norm/0": 1.7805476188659668, "attnres/final_alpha/block_1": 0.0038842852227389812, "attnres/block_norm/1": 50630.09765625, "attnres/final_alpha/block_2": 0.008502010256052017, "attnres/block_norm/2": 29948.36328125, "attnres/final_alpha/block_3": 0.010558092966675758, "attnres/block_norm/3": 71167.796875, "attnres/final_alpha/block_4": 0.012236353009939194, "attnres/block_norm/4": 17325.49609375, "attnres/final_alpha/block_5": 0.6029945611953735, "attnres/block_norm/5": 7216.66162109375, "attnres/final_alpha/block_6": 0.10190165042877197, "attnres/block_norm/6": 47816.73046875, "geo/tier1_time_s": 1.3580100536346436, "geo/step": 10425.0, "geo/rankme_slope": 0.0003933291480654762} {"step": 10430, "timestamp": 1778336984.6710913, "train/loss": 2.358365273475647, "train/z_loss": 0.001357916695997119, "train/perplexity": 10.573652286731289, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702015.20110434, "perf/iters_per_sec": 0.8115840917131137, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2321582078933715, "data/tokens_consumed": 21875392512, "data/tokens_consumed_B": 21.875392512, "train/loss_slope": 5.192648795786249e-06} {"step": 10440, "timestamp": 1778336995.0207574, "train/loss": 2.3887558221817016, "train/z_loss": 0.0013557435129769146, "train/perplexity": 10.89992405938252, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027132.2238799597, "perf/iters_per_sec": 0.9666119689369009, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345412969589234, "data/tokens_consumed": 21896364032, "data/tokens_consumed_B": 21.896364032, "train/loss_slope": 7.53204412657757e-06} {"step": 10450, "timestamp": 1778337005.362853, "grad/layer_0/attn": 0.0030734469182789326, "grad/layer_0/mlp": 0.0033781426027417183, "grad/layer_0/attn_mlp_ratio": 0.9098037557100553, "grad/layer_4/attn": 0.004392686765640974, "grad/layer_4/mlp": 0.0025521647185087204, "grad/layer_4/attn_mlp_ratio": 1.7211610840273657, "grad/layer_8/attn": 0.00323917786590755, "grad/layer_8/mlp": 0.003400343470275402, "grad/layer_8/attn_mlp_ratio": 0.9526031117041508, "grad/layer_12/attn": 0.006575206760317087, "grad/layer_12/mlp": 0.006997330114245415, "grad/layer_12/attn_mlp_ratio": 0.9396736410882892, "grad/layer_16/attn": 0.003267431864514947, "grad/layer_16/mlp": 0.004321792162954807, "grad/layer_16/attn_mlp_ratio": 0.7560363075575053, "grad/layer_20/attn": 0.00303027150221169, "grad/layer_20/mlp": 0.006372018251568079, "grad/layer_20/attn_mlp_ratio": 0.4755591297796523, "grad/layer_24/attn": 0.011517302133142948, "grad/layer_24/mlp": 0.012281480245292187, "grad/layer_24/attn_mlp_ratio": 0.9377780047140516, "grad/layer_27/attn": 0.004116683267056942, "grad/layer_27/mlp": 0.010441267862915993, "grad/layer_27/attn_mlp_ratio": 0.3942704355139692} {"step": 10450, "timestamp": 1778337005.3786943, "train/loss": 2.362710165977478, "train/z_loss": 0.001357076782733202, "train/perplexity": 10.619693619157875, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026113.0460094633, "perf/iters_per_sec": 0.9661259870574299, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350616931915284, "data/tokens_consumed": 21917335552, "data/tokens_consumed_B": 21.917335552, "train/loss_slope": 9.010151703723219e-06} {"step": 10460, "timestamp": 1778337015.7371473, "train/loss": 2.329748106002808, "train/z_loss": 0.0013700031558983028, "train/perplexity": 10.275352907311111, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026341.8480429237, "perf/iters_per_sec": 0.9662350883688563, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349448204040528, "data/tokens_consumed": 21938307072, "data/tokens_consumed_B": 21.938307072, "train/loss_slope": 7.477845209981812e-06} {"step": 10470, "timestamp": 1778337026.0833879, "train/loss": 2.330328440666199, "train/z_loss": 0.0013683809083886445, "train/perplexity": 10.281317781425997, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027920.0840577357, "perf/iters_per_sec": 0.9669876499451331, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034139370918274, "data/tokens_consumed": 21959278592, "data/tokens_consumed_B": 21.959278592, "train/loss_slope": 7.852962529948969e-06} {"step": 10480, "timestamp": 1778337036.7594488, "train/loss": 2.378375792503357, "train/z_loss": 0.001354802749119699, "train/perplexity": 10.787367703923216, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1965318.4884259917, "perf/iters_per_sec": 0.9371368829851111, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0670799732208252, "data/tokens_consumed": 21980250112, "data/tokens_consumed_B": 21.980250112, "train/loss_slope": 9.957177181913423e-06} {"step": 10490, "timestamp": 1778337047.1198435, "train/loss": 2.317117691040039, "train/z_loss": 0.0013677831157110631, "train/perplexity": 10.146387096516342, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025354.6629309906, "perf/iters_per_sec": 0.9657643618254617, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354492664337158, "data/tokens_consumed": 22001221632, "data/tokens_consumed_B": 22.001221632, "train/loss_slope": 5.417191756941185e-06} {"step": 10500, "timestamp": 1778337057.4589963, "grad/layer_0/attn": 0.003182802116498351, "grad/layer_0/mlp": 0.0034033211413770914, "grad/layer_0/attn_mlp_ratio": 0.93520472819383, "grad/layer_4/attn": 0.0017505278810858727, "grad/layer_4/mlp": 0.00266199535690248, "grad/layer_4/attn_mlp_ratio": 0.6575998755169933, "grad/layer_8/attn": 0.003687390359118581, "grad/layer_8/mlp": 0.0034419053699821234, "grad/layer_8/attn_mlp_ratio": 1.0713223797914857, "grad/layer_12/attn": 0.009836177341639996, "grad/layer_12/mlp": 0.006953307893127203, "grad/layer_12/attn_mlp_ratio": 1.414604005943974, "grad/layer_16/attn": 0.003701757872477174, "grad/layer_16/mlp": 0.004232738167047501, "grad/layer_16/attn_mlp_ratio": 0.8745539267797189, "grad/layer_20/attn": 0.0029418140184134245, "grad/layer_20/mlp": 0.006560163106769323, "grad/layer_20/attn_mlp_ratio": 0.4484361022265141, "grad/layer_24/attn": 0.013914928771555424, "grad/layer_24/mlp": 0.013544457964599133, "grad/layer_24/attn_mlp_ratio": 1.027352198603249, "grad/layer_27/attn": 0.00629319716244936, "grad/layer_27/mlp": 0.015044753439724445, "grad/layer_27/attn_mlp_ratio": 0.41829845506160573} {"step": 10500, "timestamp": 1778337058.0606976, "eos/sharpness": 68.62645149230956, "eos/L0_probe": 2.323936939239502, "eos/L_plus": 2.64219069480896, "eos/L_minus": 2.6919476985931396, "eos/grad_norm": 0.2528258264064789, "eos/embed_grad_frac": 0.03684094175696373, "eos/time_s": 0.5988988876342773} {"step": 10500, "timestamp": 1778337058.0904658, "train/loss": 2.3571130275726317, "train/z_loss": 0.0013640684075653553, "train/perplexity": 10.560419760892637, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912518.6813675382, "perf/iters_per_sec": 0.9119599730336848, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.096539354324341, "data/tokens_consumed": 22022193152, "data/tokens_consumed_B": 22.022193152, "train/loss_slope": 6.049049190311775e-06} {"step": 10500, "timestamp": 1778337059.456146, "geo/rankme_last": 429.29376220703125, "geo/layer_0/stable_rank_q_proj": 20.729978561401367, "geo/layer_0/stable_rank_k_proj": 17.175689697265625, "geo/layer_0/stable_rank_o_proj": 44.33033752441406, "geo/layer_0/stable_rank_gate_proj": 126.59403228759766, "geo/layer_0/stable_rank_down_proj": 57.44552993774414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06646072864532471, "geo/layer_0/attn_entropy_mean": 6.2408528327941895, "geo/layer_0/attn_entropy_std": 0.44818273186683655, "geo/layer_7/stable_rank_q_proj": 42.199005126953125, "geo/layer_7/stable_rank_k_proj": 38.89438247680664, "geo/layer_7/stable_rank_o_proj": 88.743408203125, "geo/layer_7/stable_rank_gate_proj": 78.55488586425781, "geo/layer_7/stable_rank_down_proj": 143.95164489746094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4045105576515198, "geo/layer_7/attn_entropy_mean": 4.713836669921875, "geo/layer_7/attn_entropy_std": 0.7618723511695862, "geo/layer_14/stable_rank_q_proj": 51.38607406616211, "geo/layer_14/stable_rank_k_proj": 42.82685852050781, "geo/layer_14/stable_rank_o_proj": 42.51828384399414, "geo/layer_14/stable_rank_gate_proj": 71.81449890136719, "geo/layer_14/stable_rank_down_proj": 126.76245880126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3768463432788849, "geo/layer_14/attn_entropy_mean": 5.531454563140869, "geo/layer_14/attn_entropy_std": 0.4722539782524109, "geo/layer_21/stable_rank_q_proj": 38.90353012084961, "geo/layer_21/stable_rank_k_proj": 28.623918533325195, "geo/layer_21/stable_rank_o_proj": 65.53793334960938, "geo/layer_21/stable_rank_gate_proj": 60.7812614440918, "geo/layer_21/stable_rank_down_proj": 49.33224105834961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1360393464565277, "geo/layer_21/attn_entropy_mean": 5.863507270812988, "geo/layer_21/attn_entropy_std": 0.326966255903244, "geo/layer_27/stable_rank_q_proj": 44.094398498535156, "geo/layer_27/stable_rank_k_proj": 30.34611701965332, "geo/layer_27/stable_rank_o_proj": 107.30281066894531, "geo/layer_27/stable_rank_gate_proj": 70.73577117919922, "geo/layer_27/stable_rank_down_proj": 129.60430908203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09614066779613495, "geo/layer_27/attn_entropy_mean": 4.329221725463867, "geo/layer_27/attn_entropy_std": 0.6766064167022705, "attnres/final_alpha/block_0": 0.25864529609680176, "attnres/block_norm/0": 1.7807050943374634, "attnres/final_alpha/block_1": 0.0038459012284874916, "attnres/block_norm/1": 50518.1171875, "attnres/final_alpha/block_2": 0.008478019386529922, "attnres/block_norm/2": 29883.5625, "attnres/final_alpha/block_3": 0.01058906689286232, "attnres/block_norm/3": 71707.34375, "attnres/final_alpha/block_4": 0.011982657015323639, "attnres/block_norm/4": 17323.84765625, "attnres/final_alpha/block_5": 0.6056969165802002, "attnres/block_norm/5": 7222.2978515625, "attnres/final_alpha/block_6": 0.10076213628053665, "attnres/block_norm/6": 47834.421875, "geo/tier1_time_s": 1.3616406917572021, "geo/step": 10500.0, "geo/rankme_slope": 0.00040457221951280514} {"step": 10500, "timestamp": 1778337066.3408604, "geo/ww_alpha_mean": 7.831402967427435, "geo/ww_alpha_std": 4.885203037820262, "geo/ww_alpha_min": 1.3517839477488662, "geo/ww_alpha_max": 32.74644361949656, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.862436318584372, "geo/ww_alpha_by_type/k_proj": 4.647511971838751, "geo/ww_alpha_by_type/v_proj": 9.186684279548388, "geo/ww_alpha_by_type/o_proj": 9.217795920848994, "geo/ww_alpha_by_type/gate_proj": 7.693369919332661, "geo/ww_alpha_by_type/up_proj": 12.404715030412843, "geo/ww_alpha_by_type/down_proj": 7.9088894617398156, "geo/twonn_id/layer_0": 0.7540019750595093, "geo/twonn_id/layer_7": 3.94020938873291, "geo/twonn_id/layer_14": 5.305284023284912, "geo/twonn_id/layer_21": 7.985598087310791, "geo/twonn_id/layer_27": 6.184937477111816, "geo/tier2_time_s": 6.87766170501709} {"step": 10500, "timestamp": 1778337067.103563, "eoc/jacobian_sigma/layer_0/attn": 1535.0860595703125, "eoc/jacobian_sigma/layer_0/mlp": 10277.0947265625, "eoc/jacobian_sigma/layer_0": 10277.0947265625, "eoc/jacobian_sigma/layer_7/attn": 1.1279765367507935, "eoc/jacobian_sigma/layer_7/mlp": 1.7683558464050293, "eoc/jacobian_sigma/layer_7": 1.7683558464050293, "eoc/jacobian_sigma/layer_14/attn": 2.086270570755005, "eoc/jacobian_sigma/layer_14/mlp": 16.15773582458496, "eoc/jacobian_sigma/layer_14": 16.15773582458496, "eoc/jacobian_sigma/layer_21/attn": 1.0961872339248657, "eoc/jacobian_sigma/layer_21/mlp": 6.095324516296387, "eoc/jacobian_sigma/layer_21": 6.095324516296387, "eoc/jacobian_sigma/layer_27/attn": 3.7263741493225098, "eoc/jacobian_sigma/layer_27/mlp": 28.746488571166992, "eoc/jacobian_sigma/layer_27": 28.746488571166992, "eoc/layer0_sigma": 10277.0947265625, "eoc/sigma_max": 28.746488571166992, "eoc/sigma_min": 1.7683558464050293, "eoc/sigma_mean": 13.191976189613342, "eoc/time_s": 0.7548694610595703} {"step": 10510, "timestamp": 1778337077.4754722, "train/loss": 2.3355730295181276, "train/z_loss": 0.0013572348281741143, "train/perplexity": 10.3353807110447, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1082059.1498510106, "perf/iters_per_sec": 0.5159660100226453, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.938112163543701, "data/tokens_consumed": 22043164672, "data/tokens_consumed_B": 22.043164672, "train/loss_slope": 5.494394010514624e-06} {"step": 10520, "timestamp": 1778337087.8259616, "train/loss": 2.306557631492615, "train/z_loss": 0.0013779398635961116, "train/perplexity": 10.03980439488495, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027191.6495632054, "perf/iters_per_sec": 0.9666403053108241, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345109701156616, "data/tokens_consumed": 22064136192, "data/tokens_consumed_B": 22.064136192, "train/loss_slope": 4.218524846449775e-06} {"step": 10530, "timestamp": 1778337098.1777556, "train/loss": 2.3598061084747313, "train/z_loss": 0.0013571668532676995, "train/perplexity": 10.588898155767465, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027186.5104120946, "perf/iters_per_sec": 0.9666378547726129, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345135927200317, "data/tokens_consumed": 22085107712, "data/tokens_consumed_B": 22.085107712, "train/loss_slope": 3.539994340239055e-06} {"step": 10540, "timestamp": 1778337108.533174, "train/loss": 2.3187935590744018, "train/z_loss": 0.0013697191723622381, "train/perplexity": 10.163405358513275, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026431.478590964, "perf/iters_per_sec": 0.9662778275446721, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348990440368653, "data/tokens_consumed": 22106079232, "data/tokens_consumed_B": 22.106079232, "train/loss_slope": 2.067547848801879e-06} {"step": 10550, "timestamp": 1778337118.8888252, "grad/layer_0/attn": 0.002670924412086606, "grad/layer_0/mlp": 0.003141392720863223, "grad/layer_0/attn_mlp_ratio": 0.8502357280337398, "grad/layer_4/attn": 0.0019012772245332599, "grad/layer_4/mlp": 0.0025514867156744003, "grad/layer_4/attn_mlp_ratio": 0.7451644323040411, "grad/layer_8/attn": 0.004877563565969467, "grad/layer_8/mlp": 0.0033048377372324467, "grad/layer_8/attn_mlp_ratio": 1.4758859000640285, "grad/layer_12/attn": 0.00922349188476801, "grad/layer_12/mlp": 0.006478149443864822, "grad/layer_12/attn_mlp_ratio": 1.4237849593180791, "grad/layer_16/attn": 0.004296079743653536, "grad/layer_16/mlp": 0.004004885908216238, "grad/layer_16/attn_mlp_ratio": 1.0727096189104754, "grad/layer_20/attn": 0.0024453382939100266, "grad/layer_20/mlp": 0.005404943134635687, "grad/layer_20/attn_mlp_ratio": 0.45242626753967224, "grad/layer_24/attn": 0.007220403756946325, "grad/layer_24/mlp": 0.008537717163562775, "grad/layer_24/attn_mlp_ratio": 0.8457065904210166, "grad/layer_27/attn": 0.004234911408275366, "grad/layer_27/mlp": 0.007208473049104214, "grad/layer_27/attn_mlp_ratio": 0.587490765475298} {"step": 10550, "timestamp": 1778337118.9045947, "train/loss": 2.3279044389724732, "train/z_loss": 0.0013637069147080182, "train/perplexity": 10.256426030721235, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023083.0558599387, "perf/iters_per_sec": 0.9646811751651472, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366119146347046, "data/tokens_consumed": 22127050752, "data/tokens_consumed_B": 22.127050752, "train/loss_slope": 1.4773952971697696e-06} {"step": 10560, "timestamp": 1778337129.260556, "train/loss": 2.3241106271743774, "train/z_loss": 0.0013642649166285992, "train/perplexity": 10.217588797797863, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026544.1348947142, "perf/iters_per_sec": 0.9663315462564059, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348415136337281, "data/tokens_consumed": 22148022272, "data/tokens_consumed_B": 22.148022272, "train/loss_slope": 1.468115084671288e-06} {"step": 10570, "timestamp": 1778337139.620616, "train/loss": 2.324010944366455, "train/z_loss": 0.0013526457478292286, "train/perplexity": 10.21657033061898, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025595.4682704166, "perf/iters_per_sec": 0.9658791867591937, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353261709213257, "data/tokens_consumed": 22168993792, "data/tokens_consumed_B": 22.168993792, "train/loss_slope": 1.886585944055801e-06} {"step": 10575, "timestamp": 1778337145.4050248, "eos/sharpness": 69.99344825744627, "eos/L0_probe": 2.3169522285461426, "eos/L_plus": 2.6210553646087646, "eos/L_minus": 2.7127835750579834, "eos/grad_norm": 0.24262937903404236, "eos/embed_grad_frac": 0.04387669637799263, "eos/time_s": 0.6185135841369629} {"step": 10575, "timestamp": 1778337146.7871187, "geo/rankme_last": 428.9563293457031, "geo/layer_0/stable_rank_q_proj": 20.746366500854492, "geo/layer_0/stable_rank_k_proj": 17.180734634399414, "geo/layer_0/stable_rank_o_proj": 44.319393157958984, "geo/layer_0/stable_rank_gate_proj": 126.3212890625, "geo/layer_0/stable_rank_down_proj": 57.44972229003906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06820788979530334, "geo/layer_0/attn_entropy_mean": 6.234988212585449, "geo/layer_0/attn_entropy_std": 0.44948312640190125, "geo/layer_7/stable_rank_q_proj": 42.19011306762695, "geo/layer_7/stable_rank_k_proj": 38.83536911010742, "geo/layer_7/stable_rank_o_proj": 88.74262237548828, "geo/layer_7/stable_rank_gate_proj": 78.59082794189453, "geo/layer_7/stable_rank_down_proj": 143.802734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40782010555267334, "geo/layer_7/attn_entropy_mean": 4.733151435852051, "geo/layer_7/attn_entropy_std": 0.7674245238304138, "geo/layer_14/stable_rank_q_proj": 51.434383392333984, "geo/layer_14/stable_rank_k_proj": 42.78926467895508, "geo/layer_14/stable_rank_o_proj": 42.56590270996094, "geo/layer_14/stable_rank_gate_proj": 71.81470489501953, "geo/layer_14/stable_rank_down_proj": 126.86834716796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36374735832214355, "geo/layer_14/attn_entropy_mean": 5.499170303344727, "geo/layer_14/attn_entropy_std": 0.4883006513118744, "geo/layer_21/stable_rank_q_proj": 38.91544723510742, "geo/layer_21/stable_rank_k_proj": 28.639060974121094, "geo/layer_21/stable_rank_o_proj": 65.61186218261719, "geo/layer_21/stable_rank_gate_proj": 60.814117431640625, "geo/layer_21/stable_rank_down_proj": 49.21404266357422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13406306505203247, "geo/layer_21/attn_entropy_mean": 5.857151031494141, "geo/layer_21/attn_entropy_std": 0.3330429792404175, "geo/layer_27/stable_rank_q_proj": 44.087398529052734, "geo/layer_27/stable_rank_k_proj": 30.30919075012207, "geo/layer_27/stable_rank_o_proj": 107.26569366455078, "geo/layer_27/stable_rank_gate_proj": 70.67236328125, "geo/layer_27/stable_rank_down_proj": 129.9818878173828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10650581866502762, "geo/layer_27/attn_entropy_mean": 4.325531005859375, "geo/layer_27/attn_entropy_std": 0.6833401322364807, "attnres/final_alpha/block_0": 0.2592301070690155, "attnres/block_norm/0": 1.7808700799942017, "attnres/final_alpha/block_1": 0.003910796716809273, "attnres/block_norm/1": 50586.66015625, "attnres/final_alpha/block_2": 0.008453678339719772, "attnres/block_norm/2": 29975.22265625, "attnres/final_alpha/block_3": 0.010510069318115711, "attnres/block_norm/3": 71727.1875, "attnres/final_alpha/block_4": 0.012058543041348457, "attnres/block_norm/4": 17348.037109375, "attnres/final_alpha/block_5": 0.6053524017333984, "attnres/block_norm/5": 7206.6162109375, "attnres/final_alpha/block_6": 0.10048436373472214, "attnres/block_norm/6": 48326.4375, "geo/tier1_time_s": 1.3621058464050293, "geo/step": 10575.0, "geo/rankme_slope": 0.0003932175995398159} {"step": 10580, "timestamp": 1778337151.9667995, "train/loss": 2.2842938184738157, "train/z_loss": 0.001371324306819588, "train/perplexity": 9.818749955344536, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699585.7162978577, "perf/iters_per_sec": 0.8104256230820931, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2339195251464843, "data/tokens_consumed": 22189965312, "data/tokens_consumed_B": 22.189965312, "train/loss_slope": -1.5011656223529842e-06} {"step": 10590, "timestamp": 1778337162.3193877, "train/loss": 2.3205215454101564, "train/z_loss": 0.001369017013348639, "train/perplexity": 10.180982766484165, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026639.0597710733, "perf/iters_per_sec": 0.9663768099646918, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347930431365966, "data/tokens_consumed": 22210936832, "data/tokens_consumed_B": 22.210936832, "train/loss_slope": -5.27999210100169e-07} {"step": 10600, "timestamp": 1778337172.6643941, "grad/layer_0/attn": 0.002960967132821679, "grad/layer_0/mlp": 0.0032904332038015127, "grad/layer_0/attn_mlp_ratio": 0.8998714939460424, "grad/layer_4/attn": 0.0017130182823166251, "grad/layer_4/mlp": 0.00261570792645216, "grad/layer_4/attn_mlp_ratio": 0.65489659587125, "grad/layer_8/attn": 0.0030176255386322737, "grad/layer_8/mlp": 0.003490670118480921, "grad/layer_8/attn_mlp_ratio": 0.8644831364062492, "grad/layer_12/attn": 0.006845348980277777, "grad/layer_12/mlp": 0.006639366503804922, "grad/layer_12/attn_mlp_ratio": 1.0310244016883794, "grad/layer_16/attn": 0.003647320903837681, "grad/layer_16/mlp": 0.0046460554003715515, "grad/layer_16/attn_mlp_ratio": 0.7850360167987661, "grad/layer_20/attn": 0.00245170621201396, "grad/layer_20/mlp": 0.0054524121806025505, "grad/layer_20/attn_mlp_ratio": 0.44965532425640786, "grad/layer_24/attn": 0.004217171110212803, "grad/layer_24/mlp": 0.007331532426178455, "grad/layer_24/attn_mlp_ratio": 0.5752100389863501, "grad/layer_27/attn": 0.006758918985724449, "grad/layer_27/mlp": 0.006841992028057575, "grad/layer_27/attn_mlp_ratio": 0.9878583399720002} {"step": 10600, "timestamp": 1778337172.6801462, "train/loss": 2.3694827795028686, "train/z_loss": 0.0013678570510819554, "train/perplexity": 10.691860804176569, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025473.7291876774, "perf/iters_per_sec": 0.9658211370409381, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353883981704712, "data/tokens_consumed": 22231908352, "data/tokens_consumed_B": 22.231908352, "train/loss_slope": 1.655264255559122e-06} {"step": 10610, "timestamp": 1778337183.0354495, "train/loss": 2.336041808128357, "train/z_loss": 0.0013733787694945932, "train/perplexity": 10.340226852245536, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026595.3549358987, "perf/iters_per_sec": 0.9663559698752874, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348153591156006, "data/tokens_consumed": 22252879872, "data/tokens_consumed_B": 22.252879872, "train/loss_slope": 1.9308558892387934e-06} {"step": 10620, "timestamp": 1778337193.3857987, "train/loss": 2.3338225841522218, "train/z_loss": 0.0013528959243558347, "train/perplexity": 10.31730501664734, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027067.3828483936, "perf/iters_per_sec": 0.966581050323674, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345743894577026, "data/tokens_consumed": 22273851392, "data/tokens_consumed_B": 22.273851392, "train/loss_slope": 2.931878576041163e-06} {"step": 10630, "timestamp": 1778337204.200897, "train/loss": 2.3525234937667845, "train/z_loss": 0.0013650661916472019, "train/perplexity": 10.512063408833098, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1940098.1023319464, "perf/iters_per_sec": 0.9251108657512409, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0809515237808227, "data/tokens_consumed": 22294822912, "data/tokens_consumed_B": 22.294822912, "train/loss_slope": 1.5737298798925473e-06} {"step": 10640, "timestamp": 1778337214.5625982, "train/loss": 2.345703053474426, "train/z_loss": 0.0013558371108956635, "train/perplexity": 10.440610455294904, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025654.0108556957, "perf/iters_per_sec": 0.9659071020391921, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352962493896485, "data/tokens_consumed": 22315794432, "data/tokens_consumed_B": 22.315794432, "train/loss_slope": 4.74465304654239e-06} {"step": 10650, "timestamp": 1778337224.9399056, "grad/layer_0/attn": 0.0037834469694644213, "grad/layer_0/mlp": 0.003644632175564766, "grad/layer_0/attn_mlp_ratio": 1.0380874347270452, "grad/layer_4/attn": 0.0020976820960640907, "grad/layer_4/mlp": 0.002611266914755106, "grad/layer_4/attn_mlp_ratio": 0.8033196468270093, "grad/layer_8/attn": 0.00892139133065939, "grad/layer_8/mlp": 0.0035484021063894033, "grad/layer_8/attn_mlp_ratio": 2.514199578220066, "grad/layer_12/attn": 0.006834890693426132, "grad/layer_12/mlp": 0.007334682159125805, "grad/layer_12/attn_mlp_ratio": 0.9318591388089331, "grad/layer_16/attn": 0.004244772717356682, "grad/layer_16/mlp": 0.005386720411479473, "grad/layer_16/attn_mlp_ratio": 0.7880068602614111, "grad/layer_20/attn": 0.004007193259894848, "grad/layer_20/mlp": 0.007283068727701902, "grad/layer_20/attn_mlp_ratio": 0.5502066992217726, "grad/layer_24/attn": 0.015111817046999931, "grad/layer_24/mlp": 0.011834581382572651, "grad/layer_24/attn_mlp_ratio": 1.2769202754869926, "grad/layer_27/attn": 0.0055327462032437325, "grad/layer_27/mlp": 0.012949006631970406, "grad/layer_27/attn_mlp_ratio": 0.42727186090525987} {"step": 10650, "timestamp": 1778337225.5337694, "eos/sharpness": 57.40146636962889, "eos/L0_probe": 2.319399356842041, "eos/L_plus": 2.660038471221924, "eos/L_minus": 2.5527749061584473, "eos/grad_norm": 0.22871287167072296, "eos/embed_grad_frac": 0.04097408428788185, "eos/time_s": 0.5910739898681641} {"step": 10650, "timestamp": 1778337225.554943, "train/loss": 2.3339634418487547, "train/z_loss": 0.0013699657749384642, "train/perplexity": 10.31875839082348, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909185.6736812508, "perf/iters_per_sec": 0.9103706711202864, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.098453664779663, "data/tokens_consumed": 22336765952, "data/tokens_consumed_B": 22.336765952, "train/loss_slope": 3.9266518919881504e-06} {"step": 10650, "timestamp": 1778337226.9205058, "geo/rankme_last": 429.35455322265625, "geo/layer_0/stable_rank_q_proj": 20.744386672973633, "geo/layer_0/stable_rank_k_proj": 17.1341552734375, "geo/layer_0/stable_rank_o_proj": 44.32631301879883, "geo/layer_0/stable_rank_gate_proj": 126.18347930908203, "geo/layer_0/stable_rank_down_proj": 57.3908805847168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06798843294382095, "geo/layer_0/attn_entropy_mean": 6.234581470489502, "geo/layer_0/attn_entropy_std": 0.44923925399780273, "geo/layer_7/stable_rank_q_proj": 42.17123031616211, "geo/layer_7/stable_rank_k_proj": 38.78193283081055, "geo/layer_7/stable_rank_o_proj": 88.7271499633789, "geo/layer_7/stable_rank_gate_proj": 78.6877212524414, "geo/layer_7/stable_rank_down_proj": 144.13304138183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4021606743335724, "geo/layer_7/attn_entropy_mean": 4.738842964172363, "geo/layer_7/attn_entropy_std": 0.7713725566864014, "geo/layer_14/stable_rank_q_proj": 51.37309646606445, "geo/layer_14/stable_rank_k_proj": 42.71774673461914, "geo/layer_14/stable_rank_o_proj": 42.602325439453125, "geo/layer_14/stable_rank_gate_proj": 71.9073486328125, "geo/layer_14/stable_rank_down_proj": 127.1992416381836, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3722810745239258, "geo/layer_14/attn_entropy_mean": 5.501186847686768, "geo/layer_14/attn_entropy_std": 0.4682054817676544, "geo/layer_21/stable_rank_q_proj": 38.895755767822266, "geo/layer_21/stable_rank_k_proj": 28.573898315429688, "geo/layer_21/stable_rank_o_proj": 65.63223266601562, "geo/layer_21/stable_rank_gate_proj": 60.68916320800781, "geo/layer_21/stable_rank_down_proj": 49.226776123046875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1358923465013504, "geo/layer_21/attn_entropy_mean": 5.844211578369141, "geo/layer_21/attn_entropy_std": 0.3389766216278076, "geo/layer_27/stable_rank_q_proj": 44.101837158203125, "geo/layer_27/stable_rank_k_proj": 30.337474822998047, "geo/layer_27/stable_rank_o_proj": 107.22113800048828, "geo/layer_27/stable_rank_gate_proj": 70.66807556152344, "geo/layer_27/stable_rank_down_proj": 129.9121856689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09510555118322372, "geo/layer_27/attn_entropy_mean": 4.311182975769043, "geo/layer_27/attn_entropy_std": 0.6817507743835449, "attnres/final_alpha/block_0": 0.2572662830352783, "attnres/block_norm/0": 1.7808583974838257, "attnres/final_alpha/block_1": 0.0038147643208503723, "attnres/block_norm/1": 50748.3671875, "attnres/final_alpha/block_2": 0.00833270326256752, "attnres/block_norm/2": 30004.98828125, "attnres/final_alpha/block_3": 0.010449051856994629, "attnres/block_norm/3": 71970.0, "attnres/final_alpha/block_4": 0.011873711831867695, "attnres/block_norm/4": 17328.412109375, "attnres/final_alpha/block_5": 0.6109603643417358, "attnres/block_norm/5": 7149.23828125, "attnres/final_alpha/block_6": 0.09730309993028641, "attnres/block_norm/6": 48379.515625, "geo/tier1_time_s": 1.3612313270568848, "geo/step": 10650.0, "geo/rankme_slope": 0.00037910660357893156} {"step": 10660, "timestamp": 1778337237.30271, "train/loss": 2.330239415168762, "train/z_loss": 0.0013523476547561586, "train/perplexity": 10.280402522737486, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785782.5484374182, "perf/iters_per_sec": 0.8515274755656329, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1743602275848388, "data/tokens_consumed": 22357737472, "data/tokens_consumed_B": 22.357737472, "train/loss_slope": 4.116402278960753e-06} {"step": 10670, "timestamp": 1778337247.6868703, "train/loss": 2.3506721019744874, "train/z_loss": 0.0013515531201846898, "train/perplexity": 10.49261946565021, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020848.2483325135, "perf/iters_per_sec": 0.9636155358946388, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377582788467408, "data/tokens_consumed": 22378708992, "data/tokens_consumed_B": 22.378708992, "train/loss_slope": 4.507215123901867e-06} {"step": 10680, "timestamp": 1778337258.5797625, "train/loss": 2.3890968561172485, "train/z_loss": 0.001351320103276521, "train/perplexity": 10.90364193730689, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926373.7958748224, "perf/iters_per_sec": 0.9185666064619171, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0886526823043823, "data/tokens_consumed": 22399680512, "data/tokens_consumed_B": 22.399680512, "train/loss_slope": 4.876136007231665e-06} {"step": 10690, "timestamp": 1778337268.9579988, "train/loss": 2.3504290103912355, "train/z_loss": 0.001365005725529045, "train/perplexity": 10.490069108169623, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021612.877502909, "perf/iters_per_sec": 0.9639801394953293, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037365770339966, "data/tokens_consumed": 22420652032, "data/tokens_consumed_B": 22.420652032, "train/loss_slope": 3.9461133289079374e-06} {"step": 10700, "timestamp": 1778337279.3331711, "grad/layer_0/attn": 0.002971827983856201, "grad/layer_0/mlp": 0.0029759493190795183, "grad/layer_0/attn_mlp_ratio": 0.9986150855935609, "grad/layer_4/attn": 0.0017988682957366109, "grad/layer_4/mlp": 0.0025817700661718845, "grad/layer_4/attn_mlp_ratio": 0.6967577204611826, "grad/layer_8/attn": 0.007161812391132116, "grad/layer_8/mlp": 0.0035130109172314405, "grad/layer_8/attn_mlp_ratio": 2.038653552750953, "grad/layer_12/attn": 0.008073734119534492, "grad/layer_12/mlp": 0.007114718202501535, "grad/layer_12/attn_mlp_ratio": 1.1347932238857255, "grad/layer_16/attn": 0.0033487507607787848, "grad/layer_16/mlp": 0.004216711036860943, "grad/layer_16/attn_mlp_ratio": 0.7941617654349699, "grad/layer_20/attn": 0.003081614151597023, "grad/layer_20/mlp": 0.005159599240869284, "grad/layer_20/attn_mlp_ratio": 0.5972584202783925, "grad/layer_24/attn": 0.0061335875652730465, "grad/layer_24/mlp": 0.008240479975938797, "grad/layer_24/attn_mlp_ratio": 0.7443240574274768, "grad/layer_27/attn": 0.006862991489470005, "grad/layer_27/mlp": 0.008095841854810715, "grad/layer_27/attn_mlp_ratio": 0.8477180666047804} {"step": 10700, "timestamp": 1778337279.3486323, "train/loss": 2.3226174116134644, "train/z_loss": 0.0013629550347104668, "train/perplexity": 10.20234312058397, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019539.4626736413, "perf/iters_per_sec": 0.9629914582603651, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0384308099746704, "data/tokens_consumed": 22441623552, "data/tokens_consumed_B": 22.441623552, "train/loss_slope": 3.587480711572105e-06} {"step": 10710, "timestamp": 1778337289.7390316, "train/loss": 2.283953332901001, "train/z_loss": 0.0013633686467073858, "train/perplexity": 9.815407381723004, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019774.0637502472, "perf/iters_per_sec": 0.9631033247710453, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383101940155028, "data/tokens_consumed": 22462595072, "data/tokens_consumed_B": 22.462595072, "train/loss_slope": -9.063656609801953e-07} {"step": 10720, "timestamp": 1778337300.116644, "train/loss": 2.330154037475586, "train/z_loss": 0.0013694358873181044, "train/perplexity": 10.279524843152833, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021983.347503771, "perf/iters_per_sec": 0.96415679335774, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037175703048706, "data/tokens_consumed": 22483566592, "data/tokens_consumed_B": 22.483566592, "train/loss_slope": -1.415542722141809e-06} {"step": 10725, "timestamp": 1778337305.8872614, "eos/sharpness": 55.876111984252915, "eos/L0_probe": 2.3187551498413086, "eos/L_plus": 2.5809874534606934, "eos/L_minus": 2.615283966064453, "eos/grad_norm": 0.19090838730335236, "eos/embed_grad_frac": 0.06502845138311386, "eos/time_s": 0.5900747776031494} {"step": 10725, "timestamp": 1778337307.2697804, "geo/rankme_last": 429.2188415527344, "geo/layer_0/stable_rank_q_proj": 20.741130828857422, "geo/layer_0/stable_rank_k_proj": 17.142452239990234, "geo/layer_0/stable_rank_o_proj": 44.34738540649414, "geo/layer_0/stable_rank_gate_proj": 126.0307846069336, "geo/layer_0/stable_rank_down_proj": 57.357521057128906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06298477947711945, "geo/layer_0/attn_entropy_mean": 6.235584735870361, "geo/layer_0/attn_entropy_std": 0.4431008994579315, "geo/layer_7/stable_rank_q_proj": 42.13765335083008, "geo/layer_7/stable_rank_k_proj": 38.80756759643555, "geo/layer_7/stable_rank_o_proj": 88.63109588623047, "geo/layer_7/stable_rank_gate_proj": 78.58776092529297, "geo/layer_7/stable_rank_down_proj": 144.1820526123047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4066203236579895, "geo/layer_7/attn_entropy_mean": 4.7401227951049805, "geo/layer_7/attn_entropy_std": 0.7743510603904724, "geo/layer_14/stable_rank_q_proj": 51.38100051879883, "geo/layer_14/stable_rank_k_proj": 42.9094123840332, "geo/layer_14/stable_rank_o_proj": 42.556636810302734, "geo/layer_14/stable_rank_gate_proj": 72.05290222167969, "geo/layer_14/stable_rank_down_proj": 127.58074188232422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36210212111473083, "geo/layer_14/attn_entropy_mean": 5.505061149597168, "geo/layer_14/attn_entropy_std": 0.4735482335090637, "geo/layer_21/stable_rank_q_proj": 38.818111419677734, "geo/layer_21/stable_rank_k_proj": 28.542837142944336, "geo/layer_21/stable_rank_o_proj": 65.63374328613281, "geo/layer_21/stable_rank_gate_proj": 60.61079788208008, "geo/layer_21/stable_rank_down_proj": 49.16119384765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1315014660358429, "geo/layer_21/attn_entropy_mean": 5.854222297668457, "geo/layer_21/attn_entropy_std": 0.33317556977272034, "geo/layer_27/stable_rank_q_proj": 44.167423248291016, "geo/layer_27/stable_rank_k_proj": 30.299060821533203, "geo/layer_27/stable_rank_o_proj": 107.2568130493164, "geo/layer_27/stable_rank_gate_proj": 70.6978759765625, "geo/layer_27/stable_rank_down_proj": 129.88389587402344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.092686228454113, "geo/layer_27/attn_entropy_mean": 4.306825637817383, "geo/layer_27/attn_entropy_std": 0.6955341696739197, "attnres/final_alpha/block_0": 0.25959497690200806, "attnres/block_norm/0": 1.7810630798339844, "attnres/final_alpha/block_1": 0.00387592613697052, "attnres/block_norm/1": 50439.484375, "attnres/final_alpha/block_2": 0.008449453860521317, "attnres/block_norm/2": 29851.56640625, "attnres/final_alpha/block_3": 0.01043626107275486, "attnres/block_norm/3": 71687.46875, "attnres/final_alpha/block_4": 0.011993156746029854, "attnres/block_norm/4": 17446.69140625, "attnres/final_alpha/block_5": 0.6044876575469971, "attnres/block_norm/5": 7253.15625, "attnres/final_alpha/block_6": 0.10116250813007355, "attnres/block_norm/6": 48205.9921875, "geo/tier1_time_s": 1.3630683422088623, "geo/step": 10725.0, "geo/rankme_slope": 0.0003949527858018207} {"step": 10730, "timestamp": 1778337312.462922, "train/loss": 2.313511300086975, "train/z_loss": 0.0013545973459258675, "train/perplexity": 10.109861160875292, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699284.4017769813, "perf/iters_per_sec": 0.8102819451222331, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2341383218765258, "data/tokens_consumed": 22504538112, "data/tokens_consumed_B": 22.504538112, "train/loss_slope": -3.6923667170645e-07} {"step": 10740, "timestamp": 1778337322.8480308, "train/loss": 2.3107738733291625, "train/z_loss": 0.0013607104774564505, "train/perplexity": 10.08222400102445, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020375.8172135071, "perf/iters_per_sec": 0.9633902631824051, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380009412765503, "data/tokens_consumed": 22525509632, "data/tokens_consumed_B": 22.525509632, "train/loss_slope": -7.399873049667927e-07} {"step": 10750, "timestamp": 1778337333.2112923, "grad/layer_0/attn": 0.003403201699256897, "grad/layer_0/mlp": 0.0035426814574748278, "grad/layer_0/attn_mlp_ratio": 0.9606287339251146, "grad/layer_4/attn": 0.0017590492498129606, "grad/layer_4/mlp": 0.0025800426956266165, "grad/layer_4/attn_mlp_ratio": 0.6817907256401688, "grad/layer_8/attn": 0.004101622384041548, "grad/layer_8/mlp": 0.0032788263633847237, "grad/layer_8/attn_mlp_ratio": 1.2509421983277071, "grad/layer_12/attn": 0.0052574267610907555, "grad/layer_12/mlp": 0.006543884519487619, "grad/layer_12/attn_mlp_ratio": 0.8034106752790529, "grad/layer_16/attn": 0.003278847085312009, "grad/layer_16/mlp": 0.004503561649471521, "grad/layer_16/attn_mlp_ratio": 0.7280564290468031, "grad/layer_20/attn": 0.0028102227952331305, "grad/layer_20/mlp": 0.005515911616384983, "grad/layer_20/attn_mlp_ratio": 0.5094756659874342, "grad/layer_24/attn": 0.00584441889077425, "grad/layer_24/mlp": 0.008173264563083649, "grad/layer_24/attn_mlp_ratio": 0.7150654153134006, "grad/layer_27/attn": 0.012022632174193859, "grad/layer_27/mlp": 0.0077480957843363285, "grad/layer_27/attn_mlp_ratio": 1.5516886153279286} {"step": 10750, "timestamp": 1778337333.2266994, "train/loss": 2.3681522607803345, "train/z_loss": 0.0013636331888847053, "train/perplexity": 10.677644542796079, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021870.2681921865, "perf/iters_per_sec": 0.9641028729401524, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372337102890015, "data/tokens_consumed": 22546481152, "data/tokens_consumed_B": 22.546481152, "train/loss_slope": -5.260368814134042e-07} {"step": 10760, "timestamp": 1778337343.607814, "train/loss": 2.3489001274108885, "train/z_loss": 0.0013728181598708033, "train/perplexity": 10.474043273981723, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021063.9282247738, "perf/iters_per_sec": 0.9637183800815458, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037647533416748, "data/tokens_consumed": 22567452672, "data/tokens_consumed_B": 22.567452672, "train/loss_slope": -2.09886706558594e-07} {"step": 10770, "timestamp": 1778337353.9894116, "train/loss": 2.329485368728638, "train/z_loss": 0.0013596313656307757, "train/perplexity": 10.272653543724353, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021283.9737930757, "perf/iters_per_sec": 0.96382330598501, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375345706939698, "data/tokens_consumed": 22588424192, "data/tokens_consumed_B": 22.588424192, "train/loss_slope": -3.0415183318263196e-07} {"step": 10780, "timestamp": 1778337364.3747218, "train/loss": 2.337232565879822, "train/z_loss": 0.0013605184271000327, "train/perplexity": 10.35254689115695, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020593.7634853853, "perf/iters_per_sec": 0.9634941880633284, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378889799118043, "data/tokens_consumed": 22609395712, "data/tokens_consumed_B": 22.609395712, "train/loss_slope": 2.1522306516082152e-06} {"step": 10790, "timestamp": 1778337374.7626796, "train/loss": 2.354925012588501, "train/z_loss": 0.0013602642575278878, "train/perplexity": 10.537338664317893, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019796.6966634074, "perf/iters_per_sec": 0.9631141169850385, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0382985591888427, "data/tokens_consumed": 22630367232, "data/tokens_consumed_B": 22.630367232, "train/loss_slope": 2.1107748325186086e-06} {"step": 10800, "timestamp": 1778337385.137856, "grad/layer_0/attn": 0.002857393119484186, "grad/layer_0/mlp": 0.003124458249658346, "grad/layer_0/attn_mlp_ratio": 0.9145243110047017, "grad/layer_4/attn": 0.0019269149051979184, "grad/layer_4/mlp": 0.00256421510130167, "grad/layer_4/attn_mlp_ratio": 0.7514638023437972, "grad/layer_8/attn": 0.003460861975327134, "grad/layer_8/mlp": 0.0036621838808059692, "grad/layer_8/attn_mlp_ratio": 0.945026790971182, "grad/layer_12/attn": 0.006213045679032803, "grad/layer_12/mlp": 0.007334393449127674, "grad/layer_12/attn_mlp_ratio": 0.8471110307097933, "grad/layer_16/attn": 0.003950428683310747, "grad/layer_16/mlp": 0.004378286190330982, "grad/layer_16/attn_mlp_ratio": 0.902277380087018, "grad/layer_20/attn": 0.0030535138212144375, "grad/layer_20/mlp": 0.006258932873606682, "grad/layer_20/attn_mlp_ratio": 0.48786491788469566, "grad/layer_24/attn": 0.009309140965342522, "grad/layer_24/mlp": 0.0098783690482378, "grad/layer_24/attn_mlp_ratio": 0.942376299735992, "grad/layer_27/attn": 0.006537224631756544, "grad/layer_27/mlp": 0.00844564288854599, "grad/layer_27/attn_mlp_ratio": 0.774035161162075} {"step": 10800, "timestamp": 1778337385.7310097, "eos/sharpness": 60.19184589385985, "eos/L0_probe": 2.31546688079834, "eos/L_plus": 2.5644772052764893, "eos/L_minus": 2.668375015258789, "eos/grad_norm": 0.15967325866222382, "eos/embed_grad_frac": 0.09393055737018585, "eos/time_s": 0.5904841423034668} {"step": 10800, "timestamp": 1778337385.7524347, "train/loss": 2.3363152503967286, "train/z_loss": 0.001361524756066501, "train/perplexity": 10.343054693939596, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909443.1672837958, "perf/iters_per_sec": 0.9104934536379794, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0983055353164672, "data/tokens_consumed": 22651338752, "data/tokens_consumed_B": 22.651338752, "train/loss_slope": 3.622085269134784e-06} {"step": 10800, "timestamp": 1778337387.1148179, "geo/rankme_last": 428.6318664550781, "geo/layer_0/stable_rank_q_proj": 20.71259117126465, "geo/layer_0/stable_rank_k_proj": 17.107784271240234, "geo/layer_0/stable_rank_o_proj": 44.304107666015625, "geo/layer_0/stable_rank_gate_proj": 126.0597915649414, "geo/layer_0/stable_rank_down_proj": 57.42854309082031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0634177103638649, "geo/layer_0/attn_entropy_mean": 6.23491907119751, "geo/layer_0/attn_entropy_std": 0.44877728819847107, "geo/layer_7/stable_rank_q_proj": 42.150325775146484, "geo/layer_7/stable_rank_k_proj": 38.70686721801758, "geo/layer_7/stable_rank_o_proj": 88.64273071289062, "geo/layer_7/stable_rank_gate_proj": 78.6006088256836, "geo/layer_7/stable_rank_down_proj": 144.4057159423828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3947768211364746, "geo/layer_7/attn_entropy_mean": 4.749180316925049, "geo/layer_7/attn_entropy_std": 0.7689241766929626, "geo/layer_14/stable_rank_q_proj": 51.35760498046875, "geo/layer_14/stable_rank_k_proj": 42.90414810180664, "geo/layer_14/stable_rank_o_proj": 42.599037170410156, "geo/layer_14/stable_rank_gate_proj": 72.05978393554688, "geo/layer_14/stable_rank_down_proj": 127.60771942138672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3618933856487274, "geo/layer_14/attn_entropy_mean": 5.535970687866211, "geo/layer_14/attn_entropy_std": 0.4586363434791565, "geo/layer_21/stable_rank_q_proj": 38.68122482299805, "geo/layer_21/stable_rank_k_proj": 28.51423454284668, "geo/layer_21/stable_rank_o_proj": 65.6247329711914, "geo/layer_21/stable_rank_gate_proj": 60.5421028137207, "geo/layer_21/stable_rank_down_proj": 49.04142379760742, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13426706194877625, "geo/layer_21/attn_entropy_mean": 5.86494779586792, "geo/layer_21/attn_entropy_std": 0.3247270882129669, "geo/layer_27/stable_rank_q_proj": 44.19371795654297, "geo/layer_27/stable_rank_k_proj": 30.297182083129883, "geo/layer_27/stable_rank_o_proj": 107.29450225830078, "geo/layer_27/stable_rank_gate_proj": 70.72820281982422, "geo/layer_27/stable_rank_down_proj": 129.68255615234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10191153734922409, "geo/layer_27/attn_entropy_mean": 4.319906711578369, "geo/layer_27/attn_entropy_std": 0.6752601861953735, "attnres/final_alpha/block_0": 0.259906142950058, "attnres/block_norm/0": 1.781164526939392, "attnres/final_alpha/block_1": 0.00388355809263885, "attnres/block_norm/1": 50435.51953125, "attnres/final_alpha/block_2": 0.008376382291316986, "attnres/block_norm/2": 29968.009765625, "attnres/final_alpha/block_3": 0.010401571169495583, "attnres/block_norm/3": 72279.28125, "attnres/final_alpha/block_4": 0.011941484175622463, "attnres/block_norm/4": 17420.49609375, "attnres/final_alpha/block_5": 0.6044430732727051, "attnres/block_norm/5": 7262.1572265625, "attnres/final_alpha/block_6": 0.10104779899120331, "attnres/block_norm/6": 48265.6015625, "geo/tier1_time_s": 1.35817551612854, "geo/step": 10800.0, "geo/rankme_slope": 0.00035618597048194277} {"step": 10810, "timestamp": 1778337397.491664, "train/loss": 2.304860162734985, "train/z_loss": 0.0013570212642662228, "train/perplexity": 10.022776596758243, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787055.3512006097, "perf/iters_per_sec": 0.8521343952181862, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1735238075256347, "data/tokens_consumed": 22672310272, "data/tokens_consumed_B": 22.672310272, "train/loss_slope": 6.923220159768791e-07} {"step": 10820, "timestamp": 1778337407.8678904, "train/loss": 2.313640308380127, "train/z_loss": 0.0013587607303634287, "train/perplexity": 10.111165500941196, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021917.3946541261, "perf/iters_per_sec": 0.9641253445883399, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372095346450805, "data/tokens_consumed": 22693281792, "data/tokens_consumed_B": 22.693281792, "train/loss_slope": -1.1896189647575473e-06} {"step": 10830, "timestamp": 1778337418.257617, "train/loss": 2.3023882627487184, "train/z_loss": 0.0013615581206977367, "train/perplexity": 9.998031891244747, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019770.8172674158, "perf/iters_per_sec": 0.9631017767273978, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383118629455566, "data/tokens_consumed": 22714253312, "data/tokens_consumed_B": 22.714253312, "train/loss_slope": -2.129389779760188e-06} {"step": 10840, "timestamp": 1778337428.6313274, "train/loss": 2.33890118598938, "train/z_loss": 0.0013654126902110874, "train/perplexity": 10.3698357793665, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022672.8797222336, "perf/iters_per_sec": 0.9644855879412811, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368221282958985, "data/tokens_consumed": 22735224832, "data/tokens_consumed_B": 22.735224832, "train/loss_slope": -2.2075152490148847e-06} {"step": 10850, "timestamp": 1778337438.9946158, "grad/layer_0/attn": 0.0027490952052176, "grad/layer_0/mlp": 0.00313829118385911, "grad/layer_0/attn_mlp_ratio": 0.8759847179759167, "grad/layer_4/attn": 0.0018850500928238034, "grad/layer_4/mlp": 0.002631494775414467, "grad/layer_4/attn_mlp_ratio": 0.7163419204937282, "grad/layer_8/attn": 0.006582156755030155, "grad/layer_8/mlp": 0.0035130574833601713, "grad/layer_8/attn_mlp_ratio": 1.8736262070416876, "grad/layer_12/attn": 0.00507982587441802, "grad/layer_12/mlp": 0.006706568878144026, "grad/layer_12/attn_mlp_ratio": 0.7574403381181372, "grad/layer_16/attn": 0.003872775938361883, "grad/layer_16/mlp": 0.004347756505012512, "grad/layer_16/attn_mlp_ratio": 0.8907527008059685, "grad/layer_20/attn": 0.0034439112059772015, "grad/layer_20/mlp": 0.0052461205050349236, "grad/layer_20/attn_mlp_ratio": 0.6564681724381126, "grad/layer_24/attn": 0.004380236379802227, "grad/layer_24/mlp": 0.007221646141260862, "grad/layer_24/attn_mlp_ratio": 0.606542640482132, "grad/layer_27/attn": 0.004184898920357227, "grad/layer_27/mlp": 0.006875261198729277, "grad/layer_27/attn_mlp_ratio": 0.6086894357208945} {"step": 10850, "timestamp": 1778337439.0102923, "train/loss": 2.2973803520202636, "train/z_loss": 0.0013713790220208466, "train/perplexity": 9.948087802222414, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021419.0993836646, "perf/iters_per_sec": 0.9638877388876269, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037465214729309, "data/tokens_consumed": 22756196352, "data/tokens_consumed_B": 22.756196352, "train/loss_slope": -3.1022012323150543e-06} {"step": 10860, "timestamp": 1778337449.3870547, "train/loss": 2.330481195449829, "train/z_loss": 0.0013567205634899438, "train/perplexity": 10.282888421857496, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022370.5063620997, "perf/iters_per_sec": 0.9643414050875185, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369771480560304, "data/tokens_consumed": 22777167872, "data/tokens_consumed_B": 22.777167872, "train/loss_slope": -4.190883341759822e-06} {"step": 10870, "timestamp": 1778337459.7806256, "train/loss": 2.3362496852874757, "train/z_loss": 0.001359211909584701, "train/perplexity": 10.342376572659369, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019654.8319209637, "perf/iters_per_sec": 0.9630464706044024, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03837149143219, "data/tokens_consumed": 22798139392, "data/tokens_consumed_B": 22.798139392, "train/loss_slope": -3.454357519759282e-06} {"step": 10875, "timestamp": 1778337465.5545924, "eos/sharpness": 36.59813404083251, "eos/L0_probe": 2.3140783309936523, "eos/L_plus": 2.5117180347442627, "eos/L_minus": 2.482419967651367, "eos/grad_norm": 0.14709553122520447, "eos/embed_grad_frac": 0.1176254153251648, "eos/time_s": 0.5924041271209717} {"step": 10875, "timestamp": 1778337466.9365404, "geo/rankme_last": 429.1400451660156, "geo/layer_0/stable_rank_q_proj": 20.74827766418457, "geo/layer_0/stable_rank_k_proj": 17.159513473510742, "geo/layer_0/stable_rank_o_proj": 44.27764129638672, "geo/layer_0/stable_rank_gate_proj": 126.21654510498047, "geo/layer_0/stable_rank_down_proj": 57.35203170776367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06212887540459633, "geo/layer_0/attn_entropy_mean": 6.240530014038086, "geo/layer_0/attn_entropy_std": 0.44785648584365845, "geo/layer_7/stable_rank_q_proj": 42.16331100463867, "geo/layer_7/stable_rank_k_proj": 38.85792922973633, "geo/layer_7/stable_rank_o_proj": 88.67350769042969, "geo/layer_7/stable_rank_gate_proj": 78.64366912841797, "geo/layer_7/stable_rank_down_proj": 144.69276428222656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4061683714389801, "geo/layer_7/attn_entropy_mean": 4.766292572021484, "geo/layer_7/attn_entropy_std": 0.7537776231765747, "geo/layer_14/stable_rank_q_proj": 51.4764404296875, "geo/layer_14/stable_rank_k_proj": 42.88302230834961, "geo/layer_14/stable_rank_o_proj": 42.579734802246094, "geo/layer_14/stable_rank_gate_proj": 71.94031524658203, "geo/layer_14/stable_rank_down_proj": 127.58065795898438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37307947874069214, "geo/layer_14/attn_entropy_mean": 5.528745651245117, "geo/layer_14/attn_entropy_std": 0.4775652289390564, "geo/layer_21/stable_rank_q_proj": 38.51421356201172, "geo/layer_21/stable_rank_k_proj": 28.63692283630371, "geo/layer_21/stable_rank_o_proj": 65.68689727783203, "geo/layer_21/stable_rank_gate_proj": 60.62904739379883, "geo/layer_21/stable_rank_down_proj": 49.04685592651367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1408611536026001, "geo/layer_21/attn_entropy_mean": 5.864865303039551, "geo/layer_21/attn_entropy_std": 0.3204675018787384, "geo/layer_27/stable_rank_q_proj": 44.22782516479492, "geo/layer_27/stable_rank_k_proj": 30.310577392578125, "geo/layer_27/stable_rank_o_proj": 107.52494812011719, "geo/layer_27/stable_rank_gate_proj": 70.67156982421875, "geo/layer_27/stable_rank_down_proj": 129.70066833496094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09885124117136002, "geo/layer_27/attn_entropy_mean": 4.316283702850342, "geo/layer_27/attn_entropy_std": 0.6599995493888855, "attnres/final_alpha/block_0": 0.2593143880367279, "attnres/block_norm/0": 1.7810418605804443, "attnres/final_alpha/block_1": 0.003854799084365368, "attnres/block_norm/1": 50666.69140625, "attnres/final_alpha/block_2": 0.00840030238032341, "attnres/block_norm/2": 29853.77734375, "attnres/final_alpha/block_3": 0.010351382195949554, "attnres/block_norm/3": 71839.734375, "attnres/final_alpha/block_4": 0.01187547855079174, "attnres/block_norm/4": 17400.11328125, "attnres/final_alpha/block_5": 0.6076247692108154, "attnres/block_norm/5": 7145.12890625, "attnres/final_alpha/block_6": 0.09857889264822006, "attnres/block_norm/6": 48337.56640625, "geo/tier1_time_s": 1.3604271411895752, "geo/step": 10875.0, "geo/rankme_slope": 0.0003899873621323529} {"step": 10880, "timestamp": 1778337472.125824, "train/loss": 2.33372049331665, "train/z_loss": 0.0013639615499414504, "train/perplexity": 10.31625176812177, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699334.9909868194, "perf/iters_per_sec": 0.810306067937288, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2341015815734864, "data/tokens_consumed": 22819110912, "data/tokens_consumed_B": 22.819110912, "train/loss_slope": -6.417367224431545e-06} {"step": 10890, "timestamp": 1778337482.5065465, "train/loss": 2.3228650808334352, "train/z_loss": 0.0013545898022130133, "train/perplexity": 10.204870239878435, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021674.1639067177, "perf/iters_per_sec": 0.964009363129958, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373343229293823, "data/tokens_consumed": 22840082432, "data/tokens_consumed_B": 22.840082432, "train/loss_slope": -6.645918822381559e-06} {"step": 10900, "timestamp": 1778337492.8770695, "grad/layer_0/attn": 0.002657313598319888, "grad/layer_0/mlp": 0.002784250071272254, "grad/layer_0/attn_mlp_ratio": 0.9544090634304019, "grad/layer_4/attn": 0.0018169240793213248, "grad/layer_4/mlp": 0.0026758613530546427, "grad/layer_4/attn_mlp_ratio": 0.679005289024664, "grad/layer_8/attn": 0.0029695676639676094, "grad/layer_8/mlp": 0.0034057984594255686, "grad/layer_8/attn_mlp_ratio": 0.8719152387181892, "grad/layer_12/attn": 0.006068455521017313, "grad/layer_12/mlp": 0.006700522731989622, "grad/layer_12/attn_mlp_ratio": 0.9056689564649053, "grad/layer_16/attn": 0.003875112859532237, "grad/layer_16/mlp": 0.00403905613347888, "grad/layer_16/attn_mlp_ratio": 0.9594104750045938, "grad/layer_20/attn": 0.002688140608370304, "grad/layer_20/mlp": 0.0054013775661587715, "grad/layer_20/attn_mlp_ratio": 0.4976768473740135, "grad/layer_24/attn": 0.01063835434615612, "grad/layer_24/mlp": 0.009812982752919197, "grad/layer_24/attn_mlp_ratio": 1.084110153416949, "grad/layer_27/attn": 0.004227110184729099, "grad/layer_27/mlp": 0.008906997740268707, "grad/layer_27/attn_mlp_ratio": 0.47458304813079144} {"step": 10900, "timestamp": 1778337492.892558, "train/loss": 2.3269136428833006, "train/z_loss": 0.0013609547400847077, "train/perplexity": 10.246269036507087, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020132.58603794, "perf/iters_per_sec": 0.9632742815198613, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381259202957154, "data/tokens_consumed": 22861053952, "data/tokens_consumed_B": 22.861053952, "train/loss_slope": -8.610162654868768e-06} {"step": 10910, "timestamp": 1778337503.2750735, "train/loss": 2.3687594890594483, "train/z_loss": 0.0013549232622608542, "train/perplexity": 10.684130279478865, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021104.8406277115, "perf/iters_per_sec": 0.9637378886354978, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037626528739929, "data/tokens_consumed": 22882025472, "data/tokens_consumed_B": 22.882025472, "train/loss_slope": -2.7379659858152616e-06} {"step": 10920, "timestamp": 1778337513.6556969, "train/loss": 2.3169103145599363, "train/z_loss": 0.0013605362153612077, "train/perplexity": 10.14428319263214, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021449.1555547507, "perf/iters_per_sec": 0.9639020707868341, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374497890472412, "data/tokens_consumed": 22902996992, "data/tokens_consumed_B": 22.902996992, "train/loss_slope": -6.564664897924498e-06} {"step": 10930, "timestamp": 1778337524.0367377, "train/loss": 2.340519332885742, "train/z_loss": 0.0013486970332451166, "train/perplexity": 10.386629280460165, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021042.8457489596, "perf/iters_per_sec": 0.9637083271736906, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376583576202392, "data/tokens_consumed": 22923968512, "data/tokens_consumed_B": 22.923968512, "train/loss_slope": -7.468305576895667e-06} {"step": 10940, "timestamp": 1778337534.4163373, "train/loss": 2.3156600475311278, "train/z_loss": 0.0013533983379602431, "train/perplexity": 10.13160805512986, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021709.3855229164, "perf/iters_per_sec": 0.9640261581053335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373162508010865, "data/tokens_consumed": 22944940032, "data/tokens_consumed_B": 22.944940032, "train/loss_slope": -8.902117185252249e-06} {"step": 10950, "timestamp": 1778337544.784672, "grad/layer_0/attn": 0.0031868345104157925, "grad/layer_0/mlp": 0.0034372678492218256, "grad/layer_0/attn_mlp_ratio": 0.9271417176357375, "grad/layer_4/attn": 0.0025549267884343863, "grad/layer_4/mlp": 0.0028081366326659918, "grad/layer_4/attn_mlp_ratio": 0.9098299091756786, "grad/layer_8/attn": 0.006407781504094601, "grad/layer_8/mlp": 0.0035934546031057835, "grad/layer_8/attn_mlp_ratio": 1.783181376561231, "grad/layer_12/attn": 0.00877210684120655, "grad/layer_12/mlp": 0.0070677390322089195, "grad/layer_12/attn_mlp_ratio": 1.2411475122547366, "grad/layer_16/attn": 0.0034602175001055002, "grad/layer_16/mlp": 0.004763732198625803, "grad/layer_16/attn_mlp_ratio": 0.7263669079607332, "grad/layer_20/attn": 0.006242085248231888, "grad/layer_20/mlp": 0.0065070465207099915, "grad/layer_20/attn_mlp_ratio": 0.9592808553676548, "grad/layer_24/attn": 0.012052939273416996, "grad/layer_24/mlp": 0.009873589500784874, "grad/layer_24/attn_mlp_ratio": 1.22072516285859, "grad/layer_27/attn": 0.005190251395106316, "grad/layer_27/mlp": 0.00925140455365181, "grad/layer_27/attn_mlp_ratio": 0.5610230650820756} {"step": 10950, "timestamp": 1778337545.3821442, "eos/sharpness": 35.916590690612786, "eos/L0_probe": 2.3160367012023926, "eos/L_plus": 2.5073964595794678, "eos/L_minus": 2.4838428497314453, "eos/grad_norm": 0.14626407623291016, "eos/embed_grad_frac": 0.1068049818277359, "eos/time_s": 0.5947573184967041} {"step": 10950, "timestamp": 1778337545.401544, "train/loss": 2.3368618965148924, "train/z_loss": 0.001356246042996645, "train/perplexity": 10.348710230285654, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910308.328714632, "perf/iters_per_sec": 0.9109059947560463, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0978081226348877, "data/tokens_consumed": 22965911552, "data/tokens_consumed_B": 22.965911552, "train/loss_slope": -4.686300052811172e-06} {"step": 10950, "timestamp": 1778337546.765975, "geo/rankme_last": 429.43829345703125, "geo/layer_0/stable_rank_q_proj": 20.752994537353516, "geo/layer_0/stable_rank_k_proj": 17.14434242248535, "geo/layer_0/stable_rank_o_proj": 44.25410461425781, "geo/layer_0/stable_rank_gate_proj": 126.2227554321289, "geo/layer_0/stable_rank_down_proj": 57.38742446899414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06572021543979645, "geo/layer_0/attn_entropy_mean": 6.2373127937316895, "geo/layer_0/attn_entropy_std": 0.4507392644882202, "geo/layer_7/stable_rank_q_proj": 42.21467971801758, "geo/layer_7/stable_rank_k_proj": 38.98354721069336, "geo/layer_7/stable_rank_o_proj": 88.80581665039062, "geo/layer_7/stable_rank_gate_proj": 78.61578369140625, "geo/layer_7/stable_rank_down_proj": 144.682373046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4065755307674408, "geo/layer_7/attn_entropy_mean": 4.733686447143555, "geo/layer_7/attn_entropy_std": 0.7555655241012573, "geo/layer_14/stable_rank_q_proj": 51.455753326416016, "geo/layer_14/stable_rank_k_proj": 42.752159118652344, "geo/layer_14/stable_rank_o_proj": 42.62356185913086, "geo/layer_14/stable_rank_gate_proj": 71.8244857788086, "geo/layer_14/stable_rank_down_proj": 127.49760437011719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37236708402633667, "geo/layer_14/attn_entropy_mean": 5.505145072937012, "geo/layer_14/attn_entropy_std": 0.4861365258693695, "geo/layer_21/stable_rank_q_proj": 38.479366302490234, "geo/layer_21/stable_rank_k_proj": 28.625375747680664, "geo/layer_21/stable_rank_o_proj": 65.64738464355469, "geo/layer_21/stable_rank_gate_proj": 60.674190521240234, "geo/layer_21/stable_rank_down_proj": 49.10317611694336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1395164281129837, "geo/layer_21/attn_entropy_mean": 5.835975170135498, "geo/layer_21/attn_entropy_std": 0.32856109738349915, "geo/layer_27/stable_rank_q_proj": 44.24336242675781, "geo/layer_27/stable_rank_k_proj": 30.295560836791992, "geo/layer_27/stable_rank_o_proj": 107.50877380371094, "geo/layer_27/stable_rank_gate_proj": 70.53604888916016, "geo/layer_27/stable_rank_down_proj": 129.9151153564453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1107085719704628, "geo/layer_27/attn_entropy_mean": 4.308716297149658, "geo/layer_27/attn_entropy_std": 0.674809455871582, "attnres/final_alpha/block_0": 0.2589617371559143, "attnres/block_norm/0": 1.7809728384017944, "attnres/final_alpha/block_1": 0.0038768588565289974, "attnres/block_norm/1": 50493.13671875, "attnres/final_alpha/block_2": 0.00835205428302288, "attnres/block_norm/2": 30042.44140625, "attnres/final_alpha/block_3": 0.010511834174394608, "attnres/block_norm/3": 71867.0234375, "attnres/final_alpha/block_4": 0.01191643811762333, "attnres/block_norm/4": 17380.15625, "attnres/final_alpha/block_5": 0.6074056625366211, "attnres/block_norm/5": 7182.794921875, "attnres/final_alpha/block_6": 0.09897539764642715, "attnres/block_norm/6": 48366.453125, "geo/tier1_time_s": 1.3600902557373047, "geo/step": 10950.0, "geo/rankme_slope": 0.00039642775078781514} {"step": 10960, "timestamp": 1778337557.146727, "train/loss": 2.3165292024612425, "train/z_loss": 0.0013580830651335417, "train/perplexity": 10.140417820191738, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786161.9645965567, "perf/iters_per_sec": 0.8517083952887329, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.174110770225525, "data/tokens_consumed": 22986883072, "data/tokens_consumed_B": 22.986883072, "train/loss_slope": -4.269830021027127e-06} {"step": 10970, "timestamp": 1778337567.5295615, "train/loss": 2.36932213306427, "train/z_loss": 0.0013509600074030458, "train/perplexity": 10.690143332772914, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021134.2372814994, "perf/iters_per_sec": 0.9637519060523507, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037611436843872, "data/tokens_consumed": 23007854592, "data/tokens_consumed_B": 23.007854592, "train/loss_slope": -3.6611042114267984e-06} {"step": 10980, "timestamp": 1778337577.9070647, "train/loss": 2.3322315216064453, "train/z_loss": 0.0013616442680358886, "train/perplexity": 10.300902591164625, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022233.1149419767, "perf/iters_per_sec": 0.9642758917531856, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370476007461549, "data/tokens_consumed": 23028826112, "data/tokens_consumed_B": 23.028826112, "train/loss_slope": -3.7814665370041553e-06} {"step": 10990, "timestamp": 1778337588.2935882, "train/loss": 2.3248966932296753, "train/z_loss": 0.001355835096910596, "train/perplexity": 10.225623655069354, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020074.4549423207, "perf/iters_per_sec": 0.9632465624534229, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381557941436768, "data/tokens_consumed": 23049797632, "data/tokens_consumed_B": 23.049797632, "train/loss_slope": -5.2419770346939346e-06} {"step": 11000, "timestamp": 1778337598.6605911, "grad/layer_0/attn": 0.003297836985439062, "grad/layer_0/mlp": 0.003632321720942855, "grad/layer_0/attn_mlp_ratio": 0.9079143170697993, "grad/layer_4/attn": 0.0028948490507900715, "grad/layer_4/mlp": 0.002641831524670124, "grad/layer_4/attn_mlp_ratio": 1.0957734867571436, "grad/layer_8/attn": 0.004133586771786213, "grad/layer_8/mlp": 0.0034418299328535795, "grad/layer_8/attn_mlp_ratio": 1.2009851539237997, "grad/layer_12/attn": 0.010850263759493828, "grad/layer_12/mlp": 0.00704263336956501, "grad/layer_12/attn_mlp_ratio": 1.5406543314207146, "grad/layer_16/attn": 0.0033895967062562704, "grad/layer_16/mlp": 0.004332453478127718, "grad/layer_16/attn_mlp_ratio": 0.7823734623190342, "grad/layer_20/attn": 0.0029927233699709177, "grad/layer_20/mlp": 0.0052752019837498665, "grad/layer_20/attn_mlp_ratio": 0.5673191893804278, "grad/layer_24/attn": 0.004463578574359417, "grad/layer_24/mlp": 0.007576728705316782, "grad/layer_24/attn_mlp_ratio": 0.5891168456797888, "grad/layer_27/attn": 0.0047065140679478645, "grad/layer_27/mlp": 0.006660281680524349, "grad/layer_27/attn_mlp_ratio": 0.7066538958922743} {"step": 11000, "timestamp": 1778337598.676127, "train/loss": 2.320832109451294, "train/z_loss": 0.0013636611285619438, "train/perplexity": 10.184145104663722, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021128.9430342698, "perf/iters_per_sec": 0.963749381558547, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376141548156739, "data/tokens_consumed": 23070769152, "data/tokens_consumed_B": 23.070769152, "train/loss_slope": -7.595495815717841e-06} {"step": 11000, "timestamp": 1778337605.6917067, "geo/ww_alpha_mean": 7.595874699098443, "geo/ww_alpha_std": 4.296669109967457, "geo/ww_alpha_min": 1.354261185979241, "geo/ww_alpha_max": 23.526984317403997, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.8557194398586043, "geo/ww_alpha_by_type/k_proj": 4.549481947074926, "geo/ww_alpha_by_type/v_proj": 9.392792540716377, "geo/ww_alpha_by_type/o_proj": 8.551662985018126, "geo/ww_alpha_by_type/gate_proj": 7.6660355297934055, "geo/ww_alpha_by_type/up_proj": 11.348470889116117, "geo/ww_alpha_by_type/down_proj": 7.8998930912735315, "geo/twonn_id/layer_0": 0.7532987594604492, "geo/twonn_id/layer_7": 3.188762664794922, "geo/twonn_id/layer_14": 5.705164909362793, "geo/twonn_id/layer_21": 8.14535140991211, "geo/twonn_id/layer_27": 5.648656845092773, "geo/tier2_time_s": 7.00704288482666} {"step": 11000, "timestamp": 1778337606.4403229, "eoc/jacobian_sigma/layer_0/attn": 1487.490478515625, "eoc/jacobian_sigma/layer_0/mlp": 11047.900390625, "eoc/jacobian_sigma/layer_0": 11047.900390625, "eoc/jacobian_sigma/layer_7/attn": 1.1337357759475708, "eoc/jacobian_sigma/layer_7/mlp": 1.7827496528625488, "eoc/jacobian_sigma/layer_7": 1.7827496528625488, "eoc/jacobian_sigma/layer_14/attn": 2.4436469078063965, "eoc/jacobian_sigma/layer_14/mlp": 16.332204818725586, "eoc/jacobian_sigma/layer_14": 16.332204818725586, "eoc/jacobian_sigma/layer_21/attn": 1.0957928895950317, "eoc/jacobian_sigma/layer_21/mlp": 5.323999881744385, "eoc/jacobian_sigma/layer_21": 5.323999881744385, "eoc/jacobian_sigma/layer_27/attn": 3.7635626792907715, "eoc/jacobian_sigma/layer_27/mlp": 26.950307846069336, "eoc/jacobian_sigma/layer_27": 26.950307846069336, "eoc/layer0_sigma": 11047.900390625, "eoc/sigma_max": 26.950307846069336, "eoc/sigma_min": 1.7827496528625488, "eoc/sigma_mean": 12.597315549850464, "eoc/time_s": 0.7425158023834229} {"step": 11010, "timestamp": 1778337616.841024, "train/loss": 2.284971070289612, "train/z_loss": 0.001365440513473004, "train/perplexity": 9.8254019738722, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1154976.9713746265, "perf/iters_per_sec": 0.550735936820329, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8157522201538085, "data/tokens_consumed": 23091740672, "data/tokens_consumed_B": 23.091740672, "train/loss_slope": -1.1412303282482875e-05} {"step": 11020, "timestamp": 1778337627.2197053, "train/loss": 2.3596972703933714, "train/z_loss": 0.001356010790914297, "train/perplexity": 10.587745743122804, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021958.1557031076, "perf/iters_per_sec": 0.9641447809711016, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371886253356934, "data/tokens_consumed": 23112712192, "data/tokens_consumed_B": 23.112712192, "train/loss_slope": -9.15837675610221e-06} {"step": 11025, "timestamp": 1778337633.0007038, "eos/sharpness": 26.74741744995117, "eos/L0_probe": 2.3169167041778564, "eos/L_plus": 2.4715335369110107, "eos/L_minus": 2.429774045944214, "eos/grad_norm": 0.11895983666181564, "eos/embed_grad_frac": 0.144265279173851, "eos/time_s": 0.6053640842437744} {"step": 11025, "timestamp": 1778337634.3776197, "geo/rankme_last": 429.7414855957031, "geo/layer_0/stable_rank_q_proj": 20.740251541137695, "geo/layer_0/stable_rank_k_proj": 17.144412994384766, "geo/layer_0/stable_rank_o_proj": 44.2451057434082, "geo/layer_0/stable_rank_gate_proj": 126.56318664550781, "geo/layer_0/stable_rank_down_proj": 57.38083267211914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06578409671783447, "geo/layer_0/attn_entropy_mean": 6.238833427429199, "geo/layer_0/attn_entropy_std": 0.4594820737838745, "geo/layer_7/stable_rank_q_proj": 42.19242477416992, "geo/layer_7/stable_rank_k_proj": 38.88759231567383, "geo/layer_7/stable_rank_o_proj": 88.77738189697266, "geo/layer_7/stable_rank_gate_proj": 78.6244125366211, "geo/layer_7/stable_rank_down_proj": 144.62539672851562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40732911229133606, "geo/layer_7/attn_entropy_mean": 4.745238304138184, "geo/layer_7/attn_entropy_std": 0.7643279433250427, "geo/layer_14/stable_rank_q_proj": 51.44879913330078, "geo/layer_14/stable_rank_k_proj": 42.76510238647461, "geo/layer_14/stable_rank_o_proj": 42.6387825012207, "geo/layer_14/stable_rank_gate_proj": 71.82510375976562, "geo/layer_14/stable_rank_down_proj": 127.63687896728516, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3660295009613037, "geo/layer_14/attn_entropy_mean": 5.532069206237793, "geo/layer_14/attn_entropy_std": 0.4619619846343994, "geo/layer_21/stable_rank_q_proj": 38.49501037597656, "geo/layer_21/stable_rank_k_proj": 28.57725715637207, "geo/layer_21/stable_rank_o_proj": 65.7545166015625, "geo/layer_21/stable_rank_gate_proj": 60.63084411621094, "geo/layer_21/stable_rank_down_proj": 49.055084228515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13833633065223694, "geo/layer_21/attn_entropy_mean": 5.854702949523926, "geo/layer_21/attn_entropy_std": 0.33094412088394165, "geo/layer_27/stable_rank_q_proj": 44.274208068847656, "geo/layer_27/stable_rank_k_proj": 30.22969627380371, "geo/layer_27/stable_rank_o_proj": 107.40562438964844, "geo/layer_27/stable_rank_gate_proj": 70.50780487060547, "geo/layer_27/stable_rank_down_proj": 129.59197998046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10616381466388702, "geo/layer_27/attn_entropy_mean": 4.317203044891357, "geo/layer_27/attn_entropy_std": 0.685617208480835, "attnres/final_alpha/block_0": 0.2593965232372284, "attnres/block_norm/0": 1.7809982299804688, "attnres/final_alpha/block_1": 0.0039034646470099688, "attnres/block_norm/1": 50314.97265625, "attnres/final_alpha/block_2": 0.008477773517370224, "attnres/block_norm/2": 30069.744140625, "attnres/final_alpha/block_3": 0.010484175756573677, "attnres/block_norm/3": 72426.21875, "attnres/final_alpha/block_4": 0.011685336008667946, "attnres/block_norm/4": 17392.6328125, "attnres/final_alpha/block_5": 0.6062743663787842, "attnres/block_norm/5": 7247.52587890625, "attnres/final_alpha/block_6": 0.09977833926677704, "attnres/block_norm/6": 48475.02734375, "geo/tier1_time_s": 1.3573813438415527, "geo/step": 11025.0, "geo/rankme_slope": 0.0004088464487357443} {"step": 11030, "timestamp": 1778337639.5664523, "train/loss": 2.320893383026123, "train/z_loss": 0.0013622354716062545, "train/perplexity": 10.18476914275919, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699399.4383208086, "perf/iters_per_sec": 0.8103367988208812, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2340547800064088, "data/tokens_consumed": 23133683712, "data/tokens_consumed_B": 23.133683712, "train/loss_slope": -1.0466299348860454e-05} {"step": 11040, "timestamp": 1778337649.9426033, "train/loss": 2.3246177196502686, "train/z_loss": 0.0013641942292451858, "train/perplexity": 10.222771374110646, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022003.1016611077, "perf/iters_per_sec": 0.9641662128739871, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371655702590943, "data/tokens_consumed": 23154655232, "data/tokens_consumed_B": 23.154655232, "train/loss_slope": -1.097533995419204e-05} {"step": 11050, "timestamp": 1778337660.3219838, "grad/layer_0/attn": 0.0032174624502658844, "grad/layer_0/mlp": 0.003451479133218527, "grad/layer_0/attn_mlp_ratio": 0.9321981193743355, "grad/layer_4/attn": 0.0034606843255460262, "grad/layer_4/mlp": 0.0026683597825467587, "grad/layer_4/attn_mlp_ratio": 1.2969331266677007, "grad/layer_8/attn": 0.003679840127006173, "grad/layer_8/mlp": 0.0034161475487053394, "grad/layer_8/attn_mlp_ratio": 1.0771900120888418, "grad/layer_12/attn": 0.0165961105376482, "grad/layer_12/mlp": 0.007285970728844404, "grad/layer_12/attn_mlp_ratio": 2.27781731872243, "grad/layer_16/attn": 0.0037689933087676764, "grad/layer_16/mlp": 0.005178174469619989, "grad/layer_16/attn_mlp_ratio": 0.7278613839865719, "grad/layer_20/attn": 0.003516380675137043, "grad/layer_20/mlp": 0.006660835817456245, "grad/layer_20/attn_mlp_ratio": 0.527918825611897, "grad/layer_24/attn": 0.009934579953551292, "grad/layer_24/mlp": 0.009975677356123924, "grad/layer_24/attn_mlp_ratio": 0.9958802294126499, "grad/layer_27/attn": 0.007221870590001345, "grad/layer_27/mlp": 0.010352634824812412, "grad/layer_27/attn_mlp_ratio": 0.6975876810542707} {"step": 11050, "timestamp": 1778337660.3375485, "train/loss": 2.2974709033966065, "train/z_loss": 0.0013571776682510972, "train/perplexity": 9.948988656051048, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018815.5989835523, "perf/iters_per_sec": 0.9626462931554567, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0388031482696534, "data/tokens_consumed": 23175626752, "data/tokens_consumed_B": 23.175626752, "train/loss_slope": -1.3902032851028406e-05} {"step": 11060, "timestamp": 1778337670.7138622, "train/loss": 2.3448366641998293, "train/z_loss": 0.0013583353837020695, "train/perplexity": 10.431568739764455, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022355.3017228625, "perf/iters_per_sec": 0.9643341549505532, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369849443435668, "data/tokens_consumed": 23196598272, "data/tokens_consumed_B": 23.196598272, "train/loss_slope": -1.4993010989331414e-05} {"step": 11070, "timestamp": 1778337681.094212, "train/loss": 2.353846001625061, "train/z_loss": 0.0013615333940833808, "train/perplexity": 10.525974892293448, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021274.591387439, "perf/iters_per_sec": 0.9638188321053691, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375393867492675, "data/tokens_consumed": 23217569792, "data/tokens_consumed_B": 23.217569792, "train/loss_slope": -1.3881956261269498e-05} {"step": 11080, "timestamp": 1778337691.4826896, "train/loss": 2.3360209226608277, "train/z_loss": 0.001362832321319729, "train/perplexity": 10.34001089402857, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020116.5335414917, "perf/iters_per_sec": 0.9632666270930728, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0381341695785522, "data/tokens_consumed": 23238541312, "data/tokens_consumed_B": 23.238541312, "train/loss_slope": -1.2211515522203514e-05} {"step": 11090, "timestamp": 1778337701.8637323, "train/loss": 2.3722049474716185, "train/z_loss": 0.0013564854627475143, "train/perplexity": 10.721005495559375, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021452.7326238086, "perf/iters_per_sec": 0.9639037764662783, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037447953224182, "data/tokens_consumed": 23259512832, "data/tokens_consumed_B": 23.259512832, "train/loss_slope": -1.1419167648805347e-05} {"step": 11100, "timestamp": 1778337712.236457, "grad/layer_0/attn": 0.007385279517620802, "grad/layer_0/mlp": 0.005399995017796755, "grad/layer_0/attn_mlp_ratio": 1.3676455916193608, "grad/layer_4/attn": 0.0031805280596017838, "grad/layer_4/mlp": 0.002805819734930992, "grad/layer_4/attn_mlp_ratio": 1.1335467872904275, "grad/layer_8/attn": 0.008850548416376114, "grad/layer_8/mlp": 0.003733662888407707, "grad/layer_8/attn_mlp_ratio": 2.3704732976316647, "grad/layer_12/attn": 0.005866637919098139, "grad/layer_12/mlp": 0.007796351332217455, "grad/layer_12/attn_mlp_ratio": 0.7524850527972592, "grad/layer_16/attn": 0.004055440425872803, "grad/layer_16/mlp": 0.0052176713943481445, "grad/layer_16/attn_mlp_ratio": 0.7772510075166122, "grad/layer_20/attn": 0.0029156000819057226, "grad/layer_20/mlp": 0.006213786080479622, "grad/layer_20/attn_mlp_ratio": 0.4692147423844374, "grad/layer_24/attn": 0.007109837606549263, "grad/layer_24/mlp": 0.010016650892794132, "grad/layer_24/attn_mlp_ratio": 0.7098018700725424, "grad/layer_27/attn": 0.004600688349455595, "grad/layer_27/mlp": 0.010281241498887539, "grad/layer_27/attn_mlp_ratio": 0.447483730948741} {"step": 11100, "timestamp": 1778337712.8485744, "eos/sharpness": 46.12698554992675, "eos/L0_probe": 2.3174195289611816, "eos/L_plus": 2.5769670009613037, "eos/L_minus": 2.519141912460327, "eos/grad_norm": 0.16949312388896942, "eos/embed_grad_frac": 0.16499339044094086, "eos/time_s": 0.6090500354766846} {"step": 11100, "timestamp": 1778337712.8699682, "train/loss": 2.3430232286453245, "train/z_loss": 0.0013620895333588123, "train/perplexity": 10.41266890411889, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906204.0547773452, "perf/iters_per_sec": 0.9089489244353033, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1001718282699584, "data/tokens_consumed": 23280484352, "data/tokens_consumed_B": 23.280484352, "train/loss_slope": -1.2234423692041202e-05} {"step": 11100, "timestamp": 1778337714.2364764, "geo/rankme_last": 429.5218505859375, "geo/layer_0/stable_rank_q_proj": 20.731338500976562, "geo/layer_0/stable_rank_k_proj": 17.150182723999023, "geo/layer_0/stable_rank_o_proj": 44.27320861816406, "geo/layer_0/stable_rank_gate_proj": 126.5822525024414, "geo/layer_0/stable_rank_down_proj": 57.413570404052734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06459066271781921, "geo/layer_0/attn_entropy_mean": 6.23825216293335, "geo/layer_0/attn_entropy_std": 0.4558270573616028, "geo/layer_7/stable_rank_q_proj": 42.2254524230957, "geo/layer_7/stable_rank_k_proj": 38.89564514160156, "geo/layer_7/stable_rank_o_proj": 88.82970428466797, "geo/layer_7/stable_rank_gate_proj": 78.49427795410156, "geo/layer_7/stable_rank_down_proj": 144.67935180664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.42046281695365906, "geo/layer_7/attn_entropy_mean": 4.7343645095825195, "geo/layer_7/attn_entropy_std": 0.7820753455162048, "geo/layer_14/stable_rank_q_proj": 51.47819137573242, "geo/layer_14/stable_rank_k_proj": 42.80685043334961, "geo/layer_14/stable_rank_o_proj": 42.64641571044922, "geo/layer_14/stable_rank_gate_proj": 71.91925048828125, "geo/layer_14/stable_rank_down_proj": 127.51954650878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3804113566875458, "geo/layer_14/attn_entropy_mean": 5.5367350578308105, "geo/layer_14/attn_entropy_std": 0.47577840089797974, "geo/layer_21/stable_rank_q_proj": 38.45178985595703, "geo/layer_21/stable_rank_k_proj": 28.57493782043457, "geo/layer_21/stable_rank_o_proj": 65.74933624267578, "geo/layer_21/stable_rank_gate_proj": 60.520965576171875, "geo/layer_21/stable_rank_down_proj": 49.0247802734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1345304697751999, "geo/layer_21/attn_entropy_mean": 5.841899871826172, "geo/layer_21/attn_entropy_std": 0.32907745242118835, "geo/layer_27/stable_rank_q_proj": 44.33828353881836, "geo/layer_27/stable_rank_k_proj": 30.225868225097656, "geo/layer_27/stable_rank_o_proj": 107.54854583740234, "geo/layer_27/stable_rank_gate_proj": 70.53064727783203, "geo/layer_27/stable_rank_down_proj": 129.80433654785156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10058007389307022, "geo/layer_27/attn_entropy_mean": 4.337808609008789, "geo/layer_27/attn_entropy_std": 0.664673388004303, "attnres/final_alpha/block_0": 0.25798386335372925, "attnres/block_norm/0": 1.7809569835662842, "attnres/final_alpha/block_1": 0.003749632742255926, "attnres/block_norm/1": 50769.71875, "attnres/final_alpha/block_2": 0.00826335046440363, "attnres/block_norm/2": 29959.380859375, "attnres/final_alpha/block_3": 0.010323980823159218, "attnres/block_norm/3": 72178.4609375, "attnres/final_alpha/block_4": 0.011721810325980186, "attnres/block_norm/4": 17269.0078125, "attnres/final_alpha/block_5": 0.6100682020187378, "attnres/block_norm/5": 7100.7802734375, "attnres/final_alpha/block_6": 0.09788915514945984, "attnres/block_norm/6": 48074.09375, "geo/tier1_time_s": 1.362309217453003, "geo/step": 11100.0, "geo/rankme_slope": 0.0004214675714035614} {"step": 11110, "timestamp": 1778337724.6200016, "train/loss": 2.3685718297958376, "train/z_loss": 0.0013589778915047645, "train/perplexity": 10.682125491572691, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785405.2141335048, "perf/iters_per_sec": 0.8513475485484623, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1746084213256835, "data/tokens_consumed": 23301455872, "data/tokens_consumed_B": 23.301455872, "train/loss_slope": -1.4125459629817718e-05} {"step": 11120, "timestamp": 1778337735.0051234, "train/loss": 2.310994052886963, "train/z_loss": 0.0013606988824903965, "train/perplexity": 10.084444145052835, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020347.185092623, "perf/iters_per_sec": 0.9633766103232494, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038015651702881, "data/tokens_consumed": 23322427392, "data/tokens_consumed_B": 23.322427392, "train/loss_slope": -1.5764246693681874e-05} {"step": 11130, "timestamp": 1778337745.387428, "train/loss": 2.322129416465759, "train/z_loss": 0.0013668118393979968, "train/perplexity": 10.197365641237562, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020885.1126345498, "perf/iters_per_sec": 0.9636331141636609, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03773934841156, "data/tokens_consumed": 23343398912, "data/tokens_consumed_B": 23.343398912, "train/loss_slope": -1.540912330025429e-05} {"step": 11140, "timestamp": 1778337755.7803228, "train/loss": 2.305838966369629, "train/z_loss": 0.001357275724876672, "train/perplexity": 10.03259172968064, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019208.9619534644, "perf/iters_per_sec": 0.9628338632361719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038600778579712, "data/tokens_consumed": 23364370432, "data/tokens_consumed_B": 23.364370432, "train/loss_slope": -1.7635259576792396e-05} {"step": 11150, "timestamp": 1778337766.1486003, "grad/layer_0/attn": 0.003616493893787265, "grad/layer_0/mlp": 0.0033983909524977207, "grad/layer_0/attn_mlp_ratio": 1.064178264926058, "grad/layer_4/attn": 0.0020272526890039444, "grad/layer_4/mlp": 0.0028025549836456776, "grad/layer_4/attn_mlp_ratio": 0.7233587310501001, "grad/layer_8/attn": 0.0034049341920763254, "grad/layer_8/mlp": 0.0034762900322675705, "grad/layer_8/attn_mlp_ratio": 0.9794735371685734, "grad/layer_12/attn": 0.00828128308057785, "grad/layer_12/mlp": 0.007371158339083195, "grad/layer_12/attn_mlp_ratio": 1.1234710458357553, "grad/layer_16/attn": 0.003707407508045435, "grad/layer_16/mlp": 0.004839428700506687, "grad/layer_16/attn_mlp_ratio": 0.7660836972448629, "grad/layer_20/attn": 0.003433351870626211, "grad/layer_20/mlp": 0.00653769401833415, "grad/layer_20/attn_mlp_ratio": 0.5251625127272019, "grad/layer_24/attn": 0.015267834067344666, "grad/layer_24/mlp": 0.012752380222082138, "grad/layer_24/attn_mlp_ratio": 1.1972536641576432, "grad/layer_27/attn": 0.008417203091084957, "grad/layer_27/mlp": 0.014054409228265285, "grad/layer_27/attn_mlp_ratio": 0.5989012340886388} {"step": 11150, "timestamp": 1778337766.1644566, "train/loss": 2.3414982080459597, "train/z_loss": 0.0013528117793612181, "train/perplexity": 10.396801471701615, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020756.2791552923, "perf/iters_per_sec": 0.9635716815735303, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0378055095672607, "data/tokens_consumed": 23385341952, "data/tokens_consumed_B": 23.385341952, "train/loss_slope": -1.8315094310601453e-05} {"step": 11160, "timestamp": 1778337776.5467987, "train/loss": 2.3578295946121215, "train/z_loss": 0.0013442780124023556, "train/perplexity": 10.567989721484683, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020927.457170105, "perf/iters_per_sec": 0.9636533056116605, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037717604637146, "data/tokens_consumed": 23406313472, "data/tokens_consumed_B": 23.406313472, "train/loss_slope": -1.8653123442417833e-05} {"step": 11170, "timestamp": 1778337786.9313605, "train/loss": 2.3068699836730957, "train/z_loss": 0.0013522196793928742, "train/perplexity": 10.042940839491449, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021011.2228599226, "perf/iters_per_sec": 0.963693248205148, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376745939254761, "data/tokens_consumed": 23427284992, "data/tokens_consumed_B": 23.427284992, "train/loss_slope": -1.9559923106759276e-05} {"step": 11175, "timestamp": 1778337792.7190294, "eos/sharpness": 11.674952507019041, "eos/L0_probe": 2.314821481704712, "eos/L_plus": 2.367117166519165, "eos/L_minus": 2.379275321960449, "eos/grad_norm": 0.08716751635074615, "eos/embed_grad_frac": 0.29199615120887756, "eos/time_s": 0.6005454063415527} {"step": 11175, "timestamp": 1778337794.0945072, "geo/rankme_last": 429.5751037597656, "geo/layer_0/stable_rank_q_proj": 20.739404678344727, "geo/layer_0/stable_rank_k_proj": 17.135732650756836, "geo/layer_0/stable_rank_o_proj": 44.27527618408203, "geo/layer_0/stable_rank_gate_proj": 126.53056335449219, "geo/layer_0/stable_rank_down_proj": 57.37974166870117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06565184891223907, "geo/layer_0/attn_entropy_mean": 6.243260383605957, "geo/layer_0/attn_entropy_std": 0.45510369539260864, "geo/layer_7/stable_rank_q_proj": 42.210357666015625, "geo/layer_7/stable_rank_k_proj": 38.808631896972656, "geo/layer_7/stable_rank_o_proj": 88.76734924316406, "geo/layer_7/stable_rank_gate_proj": 78.67928314208984, "geo/layer_7/stable_rank_down_proj": 144.60243225097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39380359649658203, "geo/layer_7/attn_entropy_mean": 4.753057956695557, "geo/layer_7/attn_entropy_std": 0.7538830637931824, "geo/layer_14/stable_rank_q_proj": 51.569766998291016, "geo/layer_14/stable_rank_k_proj": 42.83479690551758, "geo/layer_14/stable_rank_o_proj": 42.64076614379883, "geo/layer_14/stable_rank_gate_proj": 71.9437255859375, "geo/layer_14/stable_rank_down_proj": 127.68754577636719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36994245648384094, "geo/layer_14/attn_entropy_mean": 5.546224594116211, "geo/layer_14/attn_entropy_std": 0.4687386751174927, "geo/layer_21/stable_rank_q_proj": 38.44284439086914, "geo/layer_21/stable_rank_k_proj": 28.617403030395508, "geo/layer_21/stable_rank_o_proj": 65.7437744140625, "geo/layer_21/stable_rank_gate_proj": 60.520015716552734, "geo/layer_21/stable_rank_down_proj": 48.93947982788086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13906548917293549, "geo/layer_21/attn_entropy_mean": 5.862799167633057, "geo/layer_21/attn_entropy_std": 0.33141714334487915, "geo/layer_27/stable_rank_q_proj": 44.23717498779297, "geo/layer_27/stable_rank_k_proj": 30.175859451293945, "geo/layer_27/stable_rank_o_proj": 107.53116607666016, "geo/layer_27/stable_rank_gate_proj": 70.59585571289062, "geo/layer_27/stable_rank_down_proj": 129.7679901123047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10386494547128677, "geo/layer_27/attn_entropy_mean": 4.3185319900512695, "geo/layer_27/attn_entropy_std": 0.675258457660675, "attnres/final_alpha/block_0": 0.2607083022594452, "attnres/block_norm/0": 1.7808852195739746, "attnres/final_alpha/block_1": 0.0039579616859555244, "attnres/block_norm/1": 50487.4921875, "attnres/final_alpha/block_2": 0.008580385707318783, "attnres/block_norm/2": 29946.3984375, "attnres/final_alpha/block_3": 0.010603480041027069, "attnres/block_norm/3": 71227.421875, "attnres/final_alpha/block_4": 0.012030239216983318, "attnres/block_norm/4": 17403.10546875, "attnres/final_alpha/block_5": 0.6025687456130981, "attnres/block_norm/5": 7267.421875, "attnres/final_alpha/block_6": 0.10155089199542999, "attnres/block_norm/6": 48300.0546875, "geo/tier1_time_s": 1.3560278415679932, "geo/step": 11175.0, "geo/rankme_slope": 0.0004408488981530112} {"step": 11180, "timestamp": 1778337799.2851315, "train/loss": 2.3364143133163453, "train/z_loss": 0.0013654676033183933, "train/perplexity": 10.344079357887596, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698442.1287758101, "perf/iters_per_sec": 0.8098803180579234, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2347503423690795, "data/tokens_consumed": 23448256512, "data/tokens_consumed_B": 23.448256512, "train/loss_slope": -2.0013224452671435e-05} {"step": 11190, "timestamp": 1778337809.6710246, "train/loss": 2.343484878540039, "train/z_loss": 0.0013476125313900411, "train/perplexity": 10.417477021370193, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020336.5584665688, "perf/iters_per_sec": 0.9633715431530804, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380211114883422, "data/tokens_consumed": 23469228032, "data/tokens_consumed_B": 23.469228032, "train/loss_slope": -2.0052367776069023e-05} {"step": 11200, "timestamp": 1778337820.0425637, "grad/layer_0/attn": 0.0041082813404500484, "grad/layer_0/mlp": 0.004103685263544321, "grad/layer_0/attn_mlp_ratio": 1.0011199632765604, "grad/layer_4/attn": 0.0023476092610508204, "grad/layer_4/mlp": 0.0028105410747230053, "grad/layer_4/attn_mlp_ratio": 0.8352872685745977, "grad/layer_8/attn": 0.003948103170841932, "grad/layer_8/mlp": 0.0035869793500751257, "grad/layer_8/attn_mlp_ratio": 1.1006762725555184, "grad/layer_12/attn": 0.005653858184814453, "grad/layer_12/mlp": 0.007511062081903219, "grad/layer_12/attn_mlp_ratio": 0.7527375020854679, "grad/layer_16/attn": 0.0034480486065149307, "grad/layer_16/mlp": 0.0047619398683309555, "grad/layer_16/attn_mlp_ratio": 0.7240848539557421, "grad/layer_20/attn": 0.003520800732076168, "grad/layer_20/mlp": 0.006056065205484629, "grad/layer_20/attn_mlp_ratio": 0.5813676957689975, "grad/layer_24/attn": 0.013132357969880104, "grad/layer_24/mlp": 0.011509996838867664, "grad/layer_24/attn_mlp_ratio": 1.1409523425270385, "grad/layer_27/attn": 0.005291648209095001, "grad/layer_27/mlp": 0.011131023056805134, "grad/layer_27/attn_mlp_ratio": 0.475396388503591} {"step": 11200, "timestamp": 1778337820.0582108, "train/loss": 2.3259089469909666, "train/z_loss": 0.001363244722597301, "train/perplexity": 10.23597982172578, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020400.644749977, "perf/iters_per_sec": 0.9634021018743405, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0379881858825684, "data/tokens_consumed": 23490199552, "data/tokens_consumed_B": 23.490199552, "train/loss_slope": -2.352126396492803e-05} {"step": 11210, "timestamp": 1778337830.4471576, "train/loss": 2.3511791706085203, "train/z_loss": 0.0013463399023748933, "train/perplexity": 10.497941293021924, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019667.9555440792, "perf/iters_per_sec": 0.9630527284355541, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383647441864015, "data/tokens_consumed": 23511171072, "data/tokens_consumed_B": 23.511171072, "train/loss_slope": -2.2372249150612436e-05} {"step": 11220, "timestamp": 1778337840.821954, "train/loss": 2.338024115562439, "train/z_loss": 0.0013520010164938868, "train/perplexity": 10.360744690417597, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022482.1535445498, "perf/iters_per_sec": 0.9643946426127195, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036919903755188, "data/tokens_consumed": 23532142592, "data/tokens_consumed_B": 23.532142592, "train/loss_slope": -2.145296121695151e-05} {"step": 11230, "timestamp": 1778337851.1972005, "train/loss": 2.2968234062194823, "train/z_loss": 0.001354197191540152, "train/perplexity": 9.942548799100607, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022562.002092626, "perf/iters_per_sec": 0.9644327173674707, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368789672851562, "data/tokens_consumed": 23553114112, "data/tokens_consumed_B": 23.553114112, "train/loss_slope": -2.174899298401525e-05} {"step": 11240, "timestamp": 1778337861.5779526, "train/loss": 2.3287137269973757, "train/z_loss": 0.0013714402215555311, "train/perplexity": 10.264729793100713, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021169.4400868386, "perf/iters_per_sec": 0.9637686920580094, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375933647155762, "data/tokens_consumed": 23574085632, "data/tokens_consumed_B": 23.574085632, "train/loss_slope": -1.9752685279056523e-05} {"step": 11250, "timestamp": 1778337871.9236243, "grad/layer_0/attn": 0.002886766567826271, "grad/layer_0/mlp": 0.0031592377927154303, "grad/layer_0/attn_mlp_ratio": 0.9137540969873071, "grad/layer_4/attn": 0.0020016322378069162, "grad/layer_4/mlp": 0.00268848380073905, "grad/layer_4/attn_mlp_ratio": 0.7445208198035622, "grad/layer_8/attn": 0.00977896898984909, "grad/layer_8/mlp": 0.003906790632754564, "grad/layer_8/attn_mlp_ratio": 2.503069567525626, "grad/layer_12/attn": 0.005151606164872646, "grad/layer_12/mlp": 0.007434939965605736, "grad/layer_12/attn_mlp_ratio": 0.6928914180094251, "grad/layer_16/attn": 0.0064196339808404446, "grad/layer_16/mlp": 0.004828843753784895, "grad/layer_16/attn_mlp_ratio": 1.329434990077112, "grad/layer_20/attn": 0.002580735832452774, "grad/layer_20/mlp": 0.005524521693587303, "grad/layer_20/attn_mlp_ratio": 0.46714194076461313, "grad/layer_24/attn": 0.005860472563654184, "grad/layer_24/mlp": 0.007992823608219624, "grad/layer_24/attn_mlp_ratio": 0.733216792662075, "grad/layer_27/attn": 0.006825050804764032, "grad/layer_27/mlp": 0.007704922929406166, "grad/layer_27/attn_mlp_ratio": 0.8858038917087083} {"step": 11250, "timestamp": 1778337872.5316217, "eos/sharpness": 6.932663917541502, "eos/L0_probe": 2.3129358291625977, "eos/L_plus": 2.354778528213501, "eos/L_minus": 2.3404197692871094, "eos/grad_norm": 0.10023190081119537, "eos/embed_grad_frac": 0.2152189314365387, "eos/time_s": 0.6051805019378662} {"step": 11250, "timestamp": 1778337872.5512176, "train/loss": 2.2872739315032957, "train/z_loss": 0.0013690022053197027, "train/perplexity": 9.848054583884288, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912157.4301536777, "perf/iters_per_sec": 0.9117877150314702, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967465162277221, "data/tokens_consumed": 23595057152, "data/tokens_consumed_B": 23.595057152, "train/loss_slope": -2.3038534083739855e-05} {"step": 11250, "timestamp": 1778337873.9164464, "geo/rankme_last": 429.8418884277344, "geo/layer_0/stable_rank_q_proj": 20.72661018371582, "geo/layer_0/stable_rank_k_proj": 17.09834098815918, "geo/layer_0/stable_rank_o_proj": 44.18247985839844, "geo/layer_0/stable_rank_gate_proj": 126.47782135009766, "geo/layer_0/stable_rank_down_proj": 57.307247161865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06508303433656693, "geo/layer_0/attn_entropy_mean": 6.236748695373535, "geo/layer_0/attn_entropy_std": 0.4518606960773468, "geo/layer_7/stable_rank_q_proj": 42.14982604980469, "geo/layer_7/stable_rank_k_proj": 38.74864196777344, "geo/layer_7/stable_rank_o_proj": 88.6714096069336, "geo/layer_7/stable_rank_gate_proj": 78.81629943847656, "geo/layer_7/stable_rank_down_proj": 144.08018493652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.406025230884552, "geo/layer_7/attn_entropy_mean": 4.742105484008789, "geo/layer_7/attn_entropy_std": 0.7801140546798706, "geo/layer_14/stable_rank_q_proj": 51.71507263183594, "geo/layer_14/stable_rank_k_proj": 42.887123107910156, "geo/layer_14/stable_rank_o_proj": 42.58442687988281, "geo/layer_14/stable_rank_gate_proj": 72.02644348144531, "geo/layer_14/stable_rank_down_proj": 127.44084167480469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37440237402915955, "geo/layer_14/attn_entropy_mean": 5.542344093322754, "geo/layer_14/attn_entropy_std": 0.46811309456825256, "geo/layer_21/stable_rank_q_proj": 38.419185638427734, "geo/layer_21/stable_rank_k_proj": 28.558300018310547, "geo/layer_21/stable_rank_o_proj": 65.75249481201172, "geo/layer_21/stable_rank_gate_proj": 60.562435150146484, "geo/layer_21/stable_rank_down_proj": 48.93146514892578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13446328043937683, "geo/layer_21/attn_entropy_mean": 5.8377766609191895, "geo/layer_21/attn_entropy_std": 0.328175812959671, "geo/layer_27/stable_rank_q_proj": 44.203887939453125, "geo/layer_27/stable_rank_k_proj": 30.184646606445312, "geo/layer_27/stable_rank_o_proj": 107.60169982910156, "geo/layer_27/stable_rank_gate_proj": 70.55107116699219, "geo/layer_27/stable_rank_down_proj": 129.71786499023438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09735109657049179, "geo/layer_27/attn_entropy_mean": 4.315543174743652, "geo/layer_27/attn_entropy_std": 0.6768833994865417, "attnres/final_alpha/block_0": 0.2590663433074951, "attnres/block_norm/0": 1.7808308601379395, "attnres/final_alpha/block_1": 0.003875530092045665, "attnres/block_norm/1": 50569.515625, "attnres/final_alpha/block_2": 0.0084803132340312, "attnres/block_norm/2": 29877.201171875, "attnres/final_alpha/block_3": 0.010497190058231354, "attnres/block_norm/3": 71954.0625, "attnres/final_alpha/block_4": 0.011799746192991734, "attnres/block_norm/4": 17365.76171875, "attnres/final_alpha/block_5": 0.6065391898155212, "attnres/block_norm/5": 7194.43359375, "attnres/final_alpha/block_6": 0.09974172711372375, "attnres/block_norm/6": 48254.61328125, "geo/tier1_time_s": 1.360804796218872, "geo/step": 11250.0, "geo/rankme_slope": 0.00043612278895933373} {"step": 11260, "timestamp": 1778337884.27072, "train/loss": 2.3365971088409423, "train/z_loss": 0.0013612825539894402, "train/perplexity": 10.345970382130412, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790088.5745036001, "perf/iters_per_sec": 0.8535807487981797, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171535325050354, "data/tokens_consumed": 23616028672, "data/tokens_consumed_B": 23.616028672, "train/loss_slope": -2.394385123231416e-05} {"step": 11270, "timestamp": 1778337894.6330605, "train/loss": 2.3573844909667967, "train/z_loss": 0.0013618874247185886, "train/perplexity": 10.563286917431151, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024741.7376018164, "perf/iters_per_sec": 0.9654720962533075, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357627153396607, "data/tokens_consumed": 23637000192, "data/tokens_consumed_B": 23.637000192, "train/loss_slope": -2.0362560691112516e-05} {"step": 11280, "timestamp": 1778337904.9908059, "train/loss": 2.355259370803833, "train/z_loss": 0.001362201850861311, "train/perplexity": 10.540862499146778, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026100.445195622, "perf/iters_per_sec": 0.9661199785211668, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350681304931642, "data/tokens_consumed": 23657971712, "data/tokens_consumed_B": 23.657971712, "train/loss_slope": -1.9772076041641997e-05} {"step": 11290, "timestamp": 1778337915.3464677, "train/loss": 2.348153018951416, "train/z_loss": 0.0013550107018090785, "train/perplexity": 10.46622095007288, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026085.2777572072, "perf/iters_per_sec": 0.9661127461229359, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035075879096985, "data/tokens_consumed": 23678943232, "data/tokens_consumed_B": 23.678943232, "train/loss_slope": -1.9028134693657403e-05} {"step": 11300, "timestamp": 1778337925.6962059, "grad/layer_0/attn": 0.002959940116852522, "grad/layer_0/mlp": 0.003137294203042984, "grad/layer_0/attn_mlp_ratio": 0.943469063129194, "grad/layer_4/attn": 0.0020400811918079853, "grad/layer_4/mlp": 0.0026564213912934065, "grad/layer_4/attn_mlp_ratio": 0.7679809843786024, "grad/layer_8/attn": 0.0034787349868565798, "grad/layer_8/mlp": 0.003373297629877925, "grad/layer_8/attn_mlp_ratio": 1.031256433739832, "grad/layer_12/attn": 0.008945319801568985, "grad/layer_12/mlp": 0.006979338359087706, "grad/layer_12/attn_mlp_ratio": 1.2816859153637121, "grad/layer_16/attn": 0.003566313534975052, "grad/layer_16/mlp": 0.004551133140921593, "grad/layer_16/attn_mlp_ratio": 0.7836100035280187, "grad/layer_20/attn": 0.004156677983701229, "grad/layer_20/mlp": 0.005828154738992453, "grad/layer_20/attn_mlp_ratio": 0.713206511929223, "grad/layer_24/attn": 0.007657499518245459, "grad/layer_24/mlp": 0.008656139485538006, "grad/layer_24/attn_mlp_ratio": 0.8846321668655857, "grad/layer_27/attn": 0.007674381602555513, "grad/layer_27/mlp": 0.007499323692172766, "grad/layer_27/attn_mlp_ratio": 1.023343146026774} {"step": 11300, "timestamp": 1778337925.7120495, "train/loss": 2.3376680612564087, "train/z_loss": 0.0013540105195716024, "train/perplexity": 10.357056359318944, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024095.4141970843, "perf/iters_per_sec": 0.9651639052377149, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360934495925904, "data/tokens_consumed": 23699914752, "data/tokens_consumed_B": 23.699914752, "train/loss_slope": -1.7639641156612965e-05} {"step": 11310, "timestamp": 1778337936.0697906, "train/loss": 2.3514787197113036, "train/z_loss": 0.0013548691058531404, "train/perplexity": 10.501086412952729, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026122.660069138, "perf/iters_per_sec": 0.966130571398324, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350567817687988, "data/tokens_consumed": 23720886272, "data/tokens_consumed_B": 23.720886272, "train/loss_slope": -1.5090361451230016e-05} {"step": 11320, "timestamp": 1778337946.4258938, "train/loss": 2.3125580072402956, "train/z_loss": 0.0013600355363450945, "train/perplexity": 10.100228094845695, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025974.9123130203, "perf/iters_per_sec": 0.966060119778166, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351322650909425, "data/tokens_consumed": 23741857792, "data/tokens_consumed_B": 23.741857792, "train/loss_slope": -1.4637102824662382e-05} {"step": 11325, "timestamp": 1778337952.196303, "eos/sharpness": 63.98127079010008, "eos/L0_probe": 2.314547538757324, "eos/L_plus": 2.5888657569885254, "eos/L_minus": 2.680042028427124, "eos/grad_norm": 0.2251962423324585, "eos/embed_grad_frac": 0.046764474362134933, "eos/time_s": 0.6030631065368652} {"step": 11325, "timestamp": 1778337953.5777097, "geo/rankme_last": 430.1820373535156, "geo/layer_0/stable_rank_q_proj": 20.706443786621094, "geo/layer_0/stable_rank_k_proj": 17.047964096069336, "geo/layer_0/stable_rank_o_proj": 44.1427001953125, "geo/layer_0/stable_rank_gate_proj": 126.21223449707031, "geo/layer_0/stable_rank_down_proj": 57.3474235534668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06465914845466614, "geo/layer_0/attn_entropy_mean": 6.229704856872559, "geo/layer_0/attn_entropy_std": 0.4559074938297272, "geo/layer_7/stable_rank_q_proj": 42.0752067565918, "geo/layer_7/stable_rank_k_proj": 38.786224365234375, "geo/layer_7/stable_rank_o_proj": 88.64522552490234, "geo/layer_7/stable_rank_gate_proj": 78.76563262939453, "geo/layer_7/stable_rank_down_proj": 144.53515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3992270231246948, "geo/layer_7/attn_entropy_mean": 4.762146949768066, "geo/layer_7/attn_entropy_std": 0.7716198563575745, "geo/layer_14/stable_rank_q_proj": 51.760276794433594, "geo/layer_14/stable_rank_k_proj": 42.875221252441406, "geo/layer_14/stable_rank_o_proj": 42.54756164550781, "geo/layer_14/stable_rank_gate_proj": 72.04695892333984, "geo/layer_14/stable_rank_down_proj": 127.50289916992188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802403509616852, "geo/layer_14/attn_entropy_mean": 5.56112003326416, "geo/layer_14/attn_entropy_std": 0.4658156931400299, "geo/layer_21/stable_rank_q_proj": 38.442691802978516, "geo/layer_21/stable_rank_k_proj": 28.592348098754883, "geo/layer_21/stable_rank_o_proj": 65.72058868408203, "geo/layer_21/stable_rank_gate_proj": 60.56676483154297, "geo/layer_21/stable_rank_down_proj": 48.941680908203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1408448964357376, "geo/layer_21/attn_entropy_mean": 5.851224899291992, "geo/layer_21/attn_entropy_std": 0.3283673822879791, "geo/layer_27/stable_rank_q_proj": 44.199398040771484, "geo/layer_27/stable_rank_k_proj": 30.179588317871094, "geo/layer_27/stable_rank_o_proj": 107.5392837524414, "geo/layer_27/stable_rank_gate_proj": 70.47443389892578, "geo/layer_27/stable_rank_down_proj": 129.8849639892578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10276003181934357, "geo/layer_27/attn_entropy_mean": 4.310563087463379, "geo/layer_27/attn_entropy_std": 0.7045305371284485, "attnres/final_alpha/block_0": 0.260983943939209, "attnres/block_norm/0": 1.7809618711471558, "attnres/final_alpha/block_1": 0.0038731873501092196, "attnres/block_norm/1": 50620.28125, "attnres/final_alpha/block_2": 0.008710821159183979, "attnres/block_norm/2": 29799.58984375, "attnres/final_alpha/block_3": 0.010599752888083458, "attnres/block_norm/3": 71927.859375, "attnres/final_alpha/block_4": 0.012121975421905518, "attnres/block_norm/4": 17426.84765625, "attnres/final_alpha/block_5": 0.6027063131332397, "attnres/block_norm/5": 7186.06494140625, "attnres/final_alpha/block_6": 0.10100404173135757, "attnres/block_norm/6": 48068.2578125, "geo/tier1_time_s": 1.3596892356872559, "geo/step": 11325.0, "geo/rankme_slope": 0.0004229958975777811} {"step": 11330, "timestamp": 1778337958.7671816, "train/loss": 2.3306430101394655, "train/z_loss": 0.0013694848865270614, "train/perplexity": 10.284552478886852, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700175.4224242764, "perf/iters_per_sec": 0.8107068168755895, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2334915399551392, "data/tokens_consumed": 23762829312, "data/tokens_consumed_B": 23.762829312, "train/loss_slope": -1.2003150874703585e-05} {"step": 11340, "timestamp": 1778337969.1217995, "train/loss": 2.3151025056838987, "train/z_loss": 0.0013655124115757645, "train/perplexity": 10.125960834086717, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026383.3945506897, "perf/iters_per_sec": 0.9662548992875527, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349236011505127, "data/tokens_consumed": 23783800832, "data/tokens_consumed_B": 23.783800832, "train/loss_slope": -9.728534136525253e-06} {"step": 11350, "timestamp": 1778337979.4699135, "grad/layer_0/attn": 0.0031709526665508747, "grad/layer_0/mlp": 0.003413331927731633, "grad/layer_0/attn_mlp_ratio": 0.9289903943678651, "grad/layer_4/attn": 0.001825145911425352, "grad/layer_4/mlp": 0.0025161313824355602, "grad/layer_4/attn_mlp_ratio": 0.7253777968942429, "grad/layer_8/attn": 0.0040276749059557915, "grad/layer_8/mlp": 0.00331823225133121, "grad/layer_8/attn_mlp_ratio": 1.2138013494865623, "grad/layer_12/attn": 0.008781113661825657, "grad/layer_12/mlp": 0.006745329592376947, "grad/layer_12/attn_mlp_ratio": 1.301806444205299, "grad/layer_16/attn": 0.006169721018522978, "grad/layer_16/mlp": 0.004482860676944256, "grad/layer_16/attn_mlp_ratio": 1.3762910171679639, "grad/layer_20/attn": 0.00333422957919538, "grad/layer_20/mlp": 0.005728286225348711, "grad/layer_20/attn_mlp_ratio": 0.5820640571754954, "grad/layer_24/attn": 0.009242105297744274, "grad/layer_24/mlp": 0.009240237064659595, "grad/layer_24/attn_mlp_ratio": 1.000202173716041, "grad/layer_27/attn": 0.006453040987253189, "grad/layer_27/mlp": 0.009825760498642921, "grad/layer_27/attn_mlp_ratio": 0.656747223023574} {"step": 11350, "timestamp": 1778337979.4857793, "train/loss": 2.320940399169922, "train/z_loss": 0.0013609482208266853, "train/perplexity": 10.185248002586746, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024379.8071510943, "perf/iters_per_sec": 0.965299514365718, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035947895050049, "data/tokens_consumed": 23804772352, "data/tokens_consumed_B": 23.804772352, "train/loss_slope": -1.3114332072626841e-05} {"step": 11360, "timestamp": 1778337989.8444998, "train/loss": 2.2971216440200806, "train/z_loss": 0.001365441083908081, "train/perplexity": 9.945514485204658, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025643.5615724649, "perf/iters_per_sec": 0.965902119432671, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353015899658202, "data/tokens_consumed": 23825743872, "data/tokens_consumed_B": 23.825743872, "train/loss_slope": -1.3815712306437276e-05} {"step": 11370, "timestamp": 1778338000.2175808, "train/loss": 2.3652238845825195, "train/z_loss": 0.0013548482907935976, "train/perplexity": 10.646422120479428, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023158.0192544828, "perf/iters_per_sec": 0.9647169204971708, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365735054016114, "data/tokens_consumed": 23846715392, "data/tokens_consumed_B": 23.846715392, "train/loss_slope": -1.1282041485589315e-05} {"step": 11380, "timestamp": 1778338010.5715253, "train/loss": 2.3378684282302857, "train/z_loss": 0.0013564645778387785, "train/perplexity": 10.359131779275799, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026386.8490561412, "perf/iters_per_sec": 0.9662565465241152, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349218368530273, "data/tokens_consumed": 23867686912, "data/tokens_consumed_B": 23.867686912, "train/loss_slope": -1.1568030644350164e-05} {"step": 11390, "timestamp": 1778338020.9408977, "train/loss": 2.3496954917907713, "train/z_loss": 0.001360660302452743, "train/perplexity": 10.48237726875715, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023406.9133645028, "perf/iters_per_sec": 0.9648356024572863, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364459991455077, "data/tokens_consumed": 23888658432, "data/tokens_consumed_B": 23.888658432, "train/loss_slope": -9.621777965111907e-06} {"step": 11400, "timestamp": 1778338031.2982078, "grad/layer_0/attn": 0.003438857151195407, "grad/layer_0/mlp": 0.003415903076529503, "grad/layer_0/attn_mlp_ratio": 1.0067197380838016, "grad/layer_4/attn": 0.002057703211903572, "grad/layer_4/mlp": 0.002561913337558508, "grad/layer_4/attn_mlp_ratio": 0.8031899836024725, "grad/layer_8/attn": 0.00605364516377449, "grad/layer_8/mlp": 0.003478352213278413, "grad/layer_8/attn_mlp_ratio": 1.7403772299502391, "grad/layer_12/attn": 0.006456476636230946, "grad/layer_12/mlp": 0.006754915229976177, "grad/layer_12/attn_mlp_ratio": 0.9558190326352641, "grad/layer_16/attn": 0.004313180223107338, "grad/layer_16/mlp": 0.0046924748457968235, "grad/layer_16/attn_mlp_ratio": 0.9191695795777813, "grad/layer_20/attn": 0.00421209866181016, "grad/layer_20/mlp": 0.0063623287715017796, "grad/layer_20/attn_mlp_ratio": 0.6620372424753211, "grad/layer_24/attn": 0.00971810333430767, "grad/layer_24/mlp": 0.00857860129326582, "grad/layer_24/attn_mlp_ratio": 1.1328307364806993, "grad/layer_27/attn": 0.013850493356585503, "grad/layer_27/mlp": 0.007248803973197937, "grad/layer_27/attn_mlp_ratio": 1.9107280617222029} {"step": 11400, "timestamp": 1778338031.913023, "eos/sharpness": 45.509791374206536, "eos/L0_probe": 2.3120627403259277, "eos/L_plus": 2.496354818344116, "eos/L_minus": 2.5828685760498047, "eos/grad_norm": 0.1435389667749405, "eos/embed_grad_frac": 0.11040617525577545, "eos/time_s": 0.6120057106018066} {"step": 11400, "timestamp": 1778338031.9326155, "train/loss": 2.319528651237488, "train/z_loss": 0.001361555082257837, "train/perplexity": 10.170879144766895, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909175.1897381225, "perf/iters_per_sec": 0.9103656719866383, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0984596967697144, "data/tokens_consumed": 23909629952, "data/tokens_consumed_B": 23.909629952, "train/loss_slope": -9.029659365091182e-06} {"step": 11400, "timestamp": 1778338033.2945445, "geo/rankme_last": 429.8316345214844, "geo/layer_0/stable_rank_q_proj": 20.69553565979004, "geo/layer_0/stable_rank_k_proj": 17.032325744628906, "geo/layer_0/stable_rank_o_proj": 44.150367736816406, "geo/layer_0/stable_rank_gate_proj": 126.29881286621094, "geo/layer_0/stable_rank_down_proj": 57.3673095703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06310991197824478, "geo/layer_0/attn_entropy_mean": 6.23706579208374, "geo/layer_0/attn_entropy_std": 0.45471036434173584, "geo/layer_7/stable_rank_q_proj": 42.00049591064453, "geo/layer_7/stable_rank_k_proj": 38.77859115600586, "geo/layer_7/stable_rank_o_proj": 88.80384826660156, "geo/layer_7/stable_rank_gate_proj": 78.77249908447266, "geo/layer_7/stable_rank_down_proj": 144.77452087402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40313050150871277, "geo/layer_7/attn_entropy_mean": 4.7484450340271, "geo/layer_7/attn_entropy_std": 0.7668089270591736, "geo/layer_14/stable_rank_q_proj": 51.775001525878906, "geo/layer_14/stable_rank_k_proj": 42.82587432861328, "geo/layer_14/stable_rank_o_proj": 42.48876190185547, "geo/layer_14/stable_rank_gate_proj": 72.18820190429688, "geo/layer_14/stable_rank_down_proj": 127.28547668457031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3726012110710144, "geo/layer_14/attn_entropy_mean": 5.536360740661621, "geo/layer_14/attn_entropy_std": 0.4559939503669739, "geo/layer_21/stable_rank_q_proj": 38.49497985839844, "geo/layer_21/stable_rank_k_proj": 28.659584045410156, "geo/layer_21/stable_rank_o_proj": 65.79664611816406, "geo/layer_21/stable_rank_gate_proj": 60.45598220825195, "geo/layer_21/stable_rank_down_proj": 48.907474517822266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13205499947071075, "geo/layer_21/attn_entropy_mean": 5.850072860717773, "geo/layer_21/attn_entropy_std": 0.3238021731376648, "geo/layer_27/stable_rank_q_proj": 44.20608901977539, "geo/layer_27/stable_rank_k_proj": 30.180452346801758, "geo/layer_27/stable_rank_o_proj": 107.67032623291016, "geo/layer_27/stable_rank_gate_proj": 70.405029296875, "geo/layer_27/stable_rank_down_proj": 130.13568115234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09562806040048599, "geo/layer_27/attn_entropy_mean": 4.323554039001465, "geo/layer_27/attn_entropy_std": 0.7001626491546631, "attnres/final_alpha/block_0": 0.26341772079467773, "attnres/block_norm/0": 1.780988097190857, "attnres/final_alpha/block_1": 0.003939882852137089, "attnres/block_norm/1": 50659.875, "attnres/final_alpha/block_2": 0.008645543828606606, "attnres/block_norm/2": 29934.96875, "attnres/final_alpha/block_3": 0.010768785141408443, "attnres/block_norm/3": 71660.0078125, "attnres/final_alpha/block_4": 0.012115079909563065, "attnres/block_norm/4": 17495.623046875, "attnres/final_alpha/block_5": 0.6001202464103699, "attnres/block_norm/5": 7246.8671875, "attnres/final_alpha/block_6": 0.10099269449710846, "attnres/block_norm/6": 48043.23828125, "geo/tier1_time_s": 1.3579492568969727, "geo/step": 11400.0, "geo/rankme_slope": 0.00042375454087885156} {"step": 11410, "timestamp": 1778338043.6574159, "train/loss": 2.343274402618408, "train/z_loss": 0.001369732536841184, "train/perplexity": 10.41528462402457, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789237.4644316027, "perf/iters_per_sec": 0.8531749078901304, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.172092604637146, "data/tokens_consumed": 23930601472, "data/tokens_consumed_B": 23.930601472, "train/loss_slope": -1.03115495389337e-05} {"step": 11420, "timestamp": 1778338054.0302753, "train/loss": 2.373110771179199, "train/z_loss": 0.0013508067349903286, "train/perplexity": 10.73072123621614, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023265.3322046471, "perf/iters_per_sec": 0.9647680912993656, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365185260772705, "data/tokens_consumed": 23951572992, "data/tokens_consumed_B": 23.951572992, "train/loss_slope": -5.6129302915567e-06} {"step": 11430, "timestamp": 1778338064.392863, "train/loss": 2.3213769435882567, "train/z_loss": 0.001345663925167173, "train/perplexity": 10.189695286399468, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025179.1904788667, "perf/iters_per_sec": 0.965680690040048, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355389833450317, "data/tokens_consumed": 23972544512, "data/tokens_consumed_B": 23.972544512, "train/loss_slope": -4.90143172489858e-06} {"step": 11440, "timestamp": 1778338074.7445223, "train/loss": 2.3425611019134522, "train/z_loss": 0.00136019685305655, "train/perplexity": 10.407858043167504, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026840.6587144043, "perf/iters_per_sec": 0.9664729398319265, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346901178359986, "data/tokens_consumed": 23993516032, "data/tokens_consumed_B": 23.993516032, "train/loss_slope": -1.0452264355998114e-06} {"step": 11450, "timestamp": 1778338085.7750902, "grad/layer_0/attn": 0.004019749816507101, "grad/layer_0/mlp": 0.004127055406570435, "grad/layer_0/attn_mlp_ratio": 0.9739994555700794, "grad/layer_4/attn": 0.002402109792456031, "grad/layer_4/mlp": 0.002790661295875907, "grad/layer_4/attn_mlp_ratio": 0.8607671987744249, "grad/layer_8/attn": 0.0038507876452058554, "grad/layer_8/mlp": 0.0038138926029205322, "grad/layer_8/attn_mlp_ratio": 1.0096738280699586, "grad/layer_12/attn": 0.009663998149335384, "grad/layer_12/mlp": 0.007472504861652851, "grad/layer_12/attn_mlp_ratio": 1.2932742365416636, "grad/layer_16/attn": 0.004476596601307392, "grad/layer_16/mlp": 0.005359108094125986, "grad/layer_16/attn_mlp_ratio": 0.8353249158533695, "grad/layer_20/attn": 0.004866324830800295, "grad/layer_20/mlp": 0.007331972010433674, "grad/layer_20/attn_mlp_ratio": 0.6637129489179767, "grad/layer_24/attn": 0.014058411121368408, "grad/layer_24/mlp": 0.014701087027788162, "grad/layer_24/attn_mlp_ratio": 0.9562837767824014, "grad/layer_27/attn": 0.0039034411311149597, "grad/layer_27/mlp": 0.014757012017071247, "grad/layer_27/attn_mlp_ratio": 0.2645143271651428} {"step": 11450, "timestamp": 1778338085.7908702, "train/loss": 2.3022898197174073, "train/z_loss": 0.001361409923993051, "train/perplexity": 9.997047703122252, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1899777.8480401135, "perf/iters_per_sec": 0.905884670276696, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1038932800292969, "data/tokens_consumed": 24014487552, "data/tokens_consumed_B": 24.014487552, "train/loss_slope": -1.1044599542809678e-06} {"step": 11460, "timestamp": 1778338096.1518574, "train/loss": 2.357893681526184, "train/z_loss": 0.0013710619416087866, "train/perplexity": 10.56866701303631, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025628.5875643427, "perf/iters_per_sec": 0.9658949792691911, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353092432022095, "data/tokens_consumed": 24035459072, "data/tokens_consumed_B": 24.035459072, "train/loss_slope": 2.1431730060365598e-07} {"step": 11470, "timestamp": 1778338106.5108972, "train/loss": 2.3497936487197877, "train/z_loss": 0.0013685303158126772, "train/perplexity": 10.483406237218007, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025445.2322287941, "perf/iters_per_sec": 0.9658075486320468, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354029655456543, "data/tokens_consumed": 24056430592, "data/tokens_consumed_B": 24.056430592, "train/loss_slope": 1.0532982171279387e-06} {"step": 11475, "timestamp": 1778338112.285678, "eos/sharpness": 52.44734287261962, "eos/L0_probe": 2.3126327991485596, "eos/L_plus": 2.6061418056488037, "eos/L_minus": 2.5435972213745117, "eos/grad_norm": 0.13620223104953766, "eos/embed_grad_frac": 0.11881963163614273, "eos/time_s": 0.6036932468414307} {"step": 11475, "timestamp": 1778338113.6682642, "geo/rankme_last": 429.0997314453125, "geo/layer_0/stable_rank_q_proj": 20.656496047973633, "geo/layer_0/stable_rank_k_proj": 17.002553939819336, "geo/layer_0/stable_rank_o_proj": 44.123409271240234, "geo/layer_0/stable_rank_gate_proj": 125.98072052001953, "geo/layer_0/stable_rank_down_proj": 57.28187942504883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06539701670408249, "geo/layer_0/attn_entropy_mean": 6.231810569763184, "geo/layer_0/attn_entropy_std": 0.4504237771034241, "geo/layer_7/stable_rank_q_proj": 42.037025451660156, "geo/layer_7/stable_rank_k_proj": 38.7032356262207, "geo/layer_7/stable_rank_o_proj": 88.80184936523438, "geo/layer_7/stable_rank_gate_proj": 78.59429168701172, "geo/layer_7/stable_rank_down_proj": 144.34707641601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4057024121284485, "geo/layer_7/attn_entropy_mean": 4.732205390930176, "geo/layer_7/attn_entropy_std": 0.7518110871315002, "geo/layer_14/stable_rank_q_proj": 51.851295471191406, "geo/layer_14/stable_rank_k_proj": 42.94798278808594, "geo/layer_14/stable_rank_o_proj": 42.43710708618164, "geo/layer_14/stable_rank_gate_proj": 72.20232391357422, "geo/layer_14/stable_rank_down_proj": 127.49755096435547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37519845366477966, "geo/layer_14/attn_entropy_mean": 5.487407684326172, "geo/layer_14/attn_entropy_std": 0.4693961441516876, "geo/layer_21/stable_rank_q_proj": 38.470760345458984, "geo/layer_21/stable_rank_k_proj": 28.628944396972656, "geo/layer_21/stable_rank_o_proj": 65.8639907836914, "geo/layer_21/stable_rank_gate_proj": 60.476234436035156, "geo/layer_21/stable_rank_down_proj": 48.91259765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13209642469882965, "geo/layer_21/attn_entropy_mean": 5.851877689361572, "geo/layer_21/attn_entropy_std": 0.3210930824279785, "geo/layer_27/stable_rank_q_proj": 44.33363723754883, "geo/layer_27/stable_rank_k_proj": 30.2273006439209, "geo/layer_27/stable_rank_o_proj": 107.58946228027344, "geo/layer_27/stable_rank_gate_proj": 70.4687728881836, "geo/layer_27/stable_rank_down_proj": 130.11375427246094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09974585473537445, "geo/layer_27/attn_entropy_mean": 4.311995029449463, "geo/layer_27/attn_entropy_std": 0.6783403158187866, "attnres/final_alpha/block_0": 0.2598772346973419, "attnres/block_norm/0": 1.7810750007629395, "attnres/final_alpha/block_1": 0.003794663352891803, "attnres/block_norm/1": 50671.8046875, "attnres/final_alpha/block_2": 0.008279124274849892, "attnres/block_norm/2": 29990.04296875, "attnres/final_alpha/block_3": 0.010525712743401527, "attnres/block_norm/3": 71808.625, "attnres/final_alpha/block_4": 0.011931892484426498, "attnres/block_norm/4": 17428.3359375, "attnres/final_alpha/block_5": 0.6057121753692627, "attnres/block_norm/5": 7249.2890625, "attnres/final_alpha/block_6": 0.0998792052268982, "attnres/block_norm/6": 48704.87890625, "geo/tier1_time_s": 1.3612637519836426, "geo/step": 11475.0, "geo/rankme_slope": 0.00038174326371173467} {"step": 11480, "timestamp": 1778338118.8489966, "train/loss": 2.323531413078308, "train/z_loss": 0.0013526704628020526, "train/perplexity": 10.211672339951637, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700787.341010501, "perf/iters_per_sec": 0.8109986023952965, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2330477476119994, "data/tokens_consumed": 24077402112, "data/tokens_consumed_B": 24.077402112, "train/loss_slope": 3.2207464549479274e-06} {"step": 11490, "timestamp": 1778338129.2177157, "train/loss": 2.3774508476257323, "train/z_loss": 0.0013442289782688022, "train/perplexity": 10.777394596420788, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023786.841592153, "perf/iters_per_sec": 0.9650167663536802, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036251425743103, "data/tokens_consumed": 24098373632, "data/tokens_consumed_B": 24.098373632, "train/loss_slope": 4.944537045753218e-06} {"step": 11500, "timestamp": 1778338139.580177, "grad/layer_0/attn": 0.002862780587747693, "grad/layer_0/mlp": 0.0032780375331640244, "grad/layer_0/attn_mlp_ratio": 0.8733214526840202, "grad/layer_4/attn": 0.0034931341651827097, "grad/layer_4/mlp": 0.0026853284798562527, "grad/layer_4/attn_mlp_ratio": 1.3008218775855354, "grad/layer_8/attn": 0.004809593316167593, "grad/layer_8/mlp": 0.003326651407405734, "grad/layer_8/attn_mlp_ratio": 1.4457761221638514, "grad/layer_12/attn": 0.007234558463096619, "grad/layer_12/mlp": 0.006939206272363663, "grad/layer_12/attn_mlp_ratio": 1.0425628054397167, "grad/layer_16/attn": 0.003921110183000565, "grad/layer_16/mlp": 0.004636191762983799, "grad/layer_16/attn_mlp_ratio": 0.845760982048095, "grad/layer_20/attn": 0.0026473074685782194, "grad/layer_20/mlp": 0.005995784420520067, "grad/layer_20/attn_mlp_ratio": 0.4415281202181354, "grad/layer_24/attn": 0.012069232761859894, "grad/layer_24/mlp": 0.01116703450679779, "grad/layer_24/attn_mlp_ratio": 1.080791202573412, "grad/layer_27/attn": 0.004019620828330517, "grad/layer_27/mlp": 0.01128205843269825, "grad/layer_27/attn_mlp_ratio": 0.3562843444465957} {"step": 11500, "timestamp": 1778338139.596269, "train/loss": 2.343089723587036, "train/z_loss": 0.001360441860742867, "train/perplexity": 10.413361316951452, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021638.8970536557, "perf/iters_per_sec": 0.9639925465839652, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373524188995362, "data/tokens_consumed": 24119345152, "data/tokens_consumed_B": 24.119345152, "train/loss_slope": 6.978625504419023e-06} {"step": 11500, "timestamp": 1778338146.676293, "geo/ww_alpha_mean": 7.852052203738954, "geo/ww_alpha_std": 5.0316294740187955, "geo/ww_alpha_min": 1.341007643933119, "geo/ww_alpha_max": 33.10070900905656, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.8461888722401723, "geo/ww_alpha_by_type/k_proj": 4.589096035312504, "geo/ww_alpha_by_type/v_proj": 9.217262436416288, "geo/ww_alpha_by_type/o_proj": 8.710291483478288, "geo/ww_alpha_by_type/gate_proj": 7.7483993728166425, "geo/ww_alpha_by_type/up_proj": 13.071539832826854, "geo/ww_alpha_by_type/down_proj": 7.887146535776389, "geo/twonn_id/layer_0": 0.6799457669258118, "geo/twonn_id/layer_7": 3.8743350505828857, "geo/twonn_id/layer_14": 5.552082538604736, "geo/twonn_id/layer_21": 8.809774398803711, "geo/twonn_id/layer_27": 6.312519550323486, "geo/tier2_time_s": 7.0695531368255615} {"step": 11500, "timestamp": 1778338147.4310372, "eoc/jacobian_sigma/layer_0/attn": 1392.4716796875, "eoc/jacobian_sigma/layer_0/mlp": 11601.7060546875, "eoc/jacobian_sigma/layer_0": 11601.7060546875, "eoc/jacobian_sigma/layer_7/attn": 1.1483463048934937, "eoc/jacobian_sigma/layer_7/mlp": 1.8252556324005127, "eoc/jacobian_sigma/layer_7": 1.8252556324005127, "eoc/jacobian_sigma/layer_14/attn": 2.2032768726348877, "eoc/jacobian_sigma/layer_14/mlp": 11.948108673095703, "eoc/jacobian_sigma/layer_14": 11.948108673095703, "eoc/jacobian_sigma/layer_21/attn": 1.0967661142349243, "eoc/jacobian_sigma/layer_21/mlp": 6.579982280731201, "eoc/jacobian_sigma/layer_21": 6.579982280731201, "eoc/jacobian_sigma/layer_27/attn": 4.018568992614746, "eoc/jacobian_sigma/layer_27/mlp": 34.6546745300293, "eoc/jacobian_sigma/layer_27": 34.6546745300293, "eoc/layer0_sigma": 11601.7060546875, "eoc/sigma_max": 34.6546745300293, "eoc/sigma_min": 1.8252556324005127, "eoc/sigma_mean": 13.752005279064178, "eoc/time_s": 0.7478971481323242} {"step": 11510, "timestamp": 1778338157.8237655, "train/loss": 2.348494529724121, "train/z_loss": 0.0013566784909926355, "train/perplexity": 10.46979588768195, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1151095.6934764567, "perf/iters_per_sec": 0.5488851992971691, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8218745946884156, "data/tokens_consumed": 24140316672, "data/tokens_consumed_B": 24.140316672, "train/loss_slope": 8.045166775588694e-06} {"step": 11520, "timestamp": 1778338168.5960927, "train/loss": 2.316715049743652, "train/z_loss": 0.0013614890165627002, "train/perplexity": 10.14230256441799, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947912.2829137722, "perf/iters_per_sec": 0.9288369574135648, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0766152143478394, "data/tokens_consumed": 24161288192, "data/tokens_consumed_B": 24.161288192, "train/loss_slope": 5.449801824703038e-06} {"step": 11530, "timestamp": 1778338178.9689212, "train/loss": 2.3682718753814695, "train/z_loss": 0.0013547388836741448, "train/perplexity": 10.67892182137819, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023468.3551409815, "perf/iters_per_sec": 0.9648649001793773, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364145278930663, "data/tokens_consumed": 24182259712, "data/tokens_consumed_B": 24.182259712, "train/loss_slope": 9.132208897121076e-06} {"step": 11540, "timestamp": 1778338189.3301642, "train/loss": 2.3174596309661863, "train/z_loss": 0.0013467133045196532, "train/perplexity": 10.149857144610971, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025476.52762295, "perf/iters_per_sec": 0.9658224714388609, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353869676589966, "data/tokens_consumed": 24203231232, "data/tokens_consumed_B": 24.203231232, "train/loss_slope": 7.300297805506835e-06} {"step": 11550, "timestamp": 1778338199.6843, "grad/layer_0/attn": 0.0036740340292453766, "grad/layer_0/mlp": 0.0037813433445990086, "grad/layer_0/attn_mlp_ratio": 0.9716213517957738, "grad/layer_4/attn": 0.002505525015294552, "grad/layer_4/mlp": 0.0026280072052031755, "grad/layer_4/attn_mlp_ratio": 0.9533934743384762, "grad/layer_8/attn": 0.00403855973854661, "grad/layer_8/mlp": 0.0034633262548595667, "grad/layer_8/attn_mlp_ratio": 1.166092745744248, "grad/layer_12/attn": 0.00902489759027958, "grad/layer_12/mlp": 0.006890636868774891, "grad/layer_12/attn_mlp_ratio": 1.3097334297505656, "grad/layer_16/attn": 0.0034650869201868773, "grad/layer_16/mlp": 0.005157919134944677, "grad/layer_16/attn_mlp_ratio": 0.6717993753587813, "grad/layer_20/attn": 0.0034315690863877535, "grad/layer_20/mlp": 0.006195788737386465, "grad/layer_20/attn_mlp_ratio": 0.553855074220908, "grad/layer_24/attn": 0.008824666030704975, "grad/layer_24/mlp": 0.0091491574421525, "grad/layer_24/attn_mlp_ratio": 0.9645331813391004, "grad/layer_27/attn": 0.008182007819414139, "grad/layer_27/mlp": 0.008008129894733429, "grad/layer_27/attn_mlp_ratio": 1.0217126626060093} {"step": 11550, "timestamp": 1778338200.3036356, "eos/sharpness": 54.56681251525878, "eos/L0_probe": 2.3158490657806396, "eos/L_plus": 2.53072452545166, "eos/L_minus": 2.646641731262207, "eos/grad_norm": 0.15999123454093933, "eos/embed_grad_frac": 0.09860459715127945, "eos/time_s": 0.6165218353271484} {"step": 11550, "timestamp": 1778338200.324383, "train/loss": 2.312446880340576, "train/z_loss": 0.0013530768337659538, "train/perplexity": 10.09910575017355, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908646.5008176868, "perf/iters_per_sec": 0.9101135734642443, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0987639665603637, "data/tokens_consumed": 24224202752, "data/tokens_consumed_B": 24.224202752, "train/loss_slope": 5.7243749611090495e-06} {"step": 11550, "timestamp": 1778338201.6945305, "geo/rankme_last": 429.8787841796875, "geo/layer_0/stable_rank_q_proj": 20.643163681030273, "geo/layer_0/stable_rank_k_proj": 16.987607955932617, "geo/layer_0/stable_rank_o_proj": 44.108856201171875, "geo/layer_0/stable_rank_gate_proj": 125.94496154785156, "geo/layer_0/stable_rank_down_proj": 57.19068145751953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06579860299825668, "geo/layer_0/attn_entropy_mean": 6.236919403076172, "geo/layer_0/attn_entropy_std": 0.44921913743019104, "geo/layer_7/stable_rank_q_proj": 42.079246520996094, "geo/layer_7/stable_rank_k_proj": 38.80841064453125, "geo/layer_7/stable_rank_o_proj": 88.7194595336914, "geo/layer_7/stable_rank_gate_proj": 78.52339935302734, "geo/layer_7/stable_rank_down_proj": 144.3983917236328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4127257466316223, "geo/layer_7/attn_entropy_mean": 4.732785701751709, "geo/layer_7/attn_entropy_std": 0.7720977067947388, "geo/layer_14/stable_rank_q_proj": 51.84001922607422, "geo/layer_14/stable_rank_k_proj": 42.96955490112305, "geo/layer_14/stable_rank_o_proj": 42.40299606323242, "geo/layer_14/stable_rank_gate_proj": 72.13946533203125, "geo/layer_14/stable_rank_down_proj": 127.31682586669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37824928760528564, "geo/layer_14/attn_entropy_mean": 5.504158973693848, "geo/layer_14/attn_entropy_std": 0.4707064926624298, "geo/layer_21/stable_rank_q_proj": 38.45058059692383, "geo/layer_21/stable_rank_k_proj": 28.60353660583496, "geo/layer_21/stable_rank_o_proj": 65.7591781616211, "geo/layer_21/stable_rank_gate_proj": 60.5068359375, "geo/layer_21/stable_rank_down_proj": 48.91880416870117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13882727921009064, "geo/layer_21/attn_entropy_mean": 5.847433090209961, "geo/layer_21/attn_entropy_std": 0.32771748304367065, "geo/layer_27/stable_rank_q_proj": 44.374595642089844, "geo/layer_27/stable_rank_k_proj": 30.278512954711914, "geo/layer_27/stable_rank_o_proj": 107.47251892089844, "geo/layer_27/stable_rank_gate_proj": 70.47057342529297, "geo/layer_27/stable_rank_down_proj": 130.3960723876953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09651559591293335, "geo/layer_27/attn_entropy_mean": 4.315381050109863, "geo/layer_27/attn_entropy_std": 0.6561019420623779, "attnres/final_alpha/block_0": 0.2618129849433899, "attnres/block_norm/0": 1.7809555530548096, "attnres/final_alpha/block_1": 0.0038487925194203854, "attnres/block_norm/1": 50747.48828125, "attnres/final_alpha/block_2": 0.00847549457103014, "attnres/block_norm/2": 29907.33984375, "attnres/final_alpha/block_3": 0.010589745827019215, "attnres/block_norm/3": 71752.484375, "attnres/final_alpha/block_4": 0.012028791941702366, "attnres/block_norm/4": 17465.771484375, "attnres/final_alpha/block_5": 0.6026746034622192, "attnres/block_norm/5": 7201.7294921875, "attnres/final_alpha/block_6": 0.10056960582733154, "attnres/block_norm/6": 48294.8984375, "geo/tier1_time_s": 1.3663804531097412, "geo/step": 11550.0, "geo/rankme_slope": 0.00039520038484143656} {"step": 11560, "timestamp": 1778338212.0479424, "train/loss": 2.30548894405365, "train/z_loss": 0.0013751505757682025, "train/perplexity": 10.029080713191053, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789399.5112989233, "perf/iters_per_sec": 0.8532521778578392, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.17198646068573, "data/tokens_consumed": 24245174272, "data/tokens_consumed_B": 24.245174272, "train/loss_slope": 3.5237323046803243e-06} {"step": 11570, "timestamp": 1778338222.4044945, "train/loss": 2.334827423095703, "train/z_loss": 0.0013612740905955434, "train/perplexity": 10.327677456963071, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025928.1097956665, "perf/iters_per_sec": 0.9660378025987942, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351561784744263, "data/tokens_consumed": 24266145792, "data/tokens_consumed_B": 24.266145792, "train/loss_slope": 3.0822764636158785e-06} {"step": 11580, "timestamp": 1778338232.7767835, "train/loss": 2.3721869945526124, "train/z_loss": 0.001370311062783003, "train/perplexity": 10.72081302394377, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023752.1994113834, "perf/iters_per_sec": 0.965000247674648, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362691640853883, "data/tokens_consumed": 24287117312, "data/tokens_consumed_B": 24.287117312, "train/loss_slope": 2.4401214936099036e-06} {"step": 11590, "timestamp": 1778338243.137726, "train/loss": 2.3447553157806396, "train/z_loss": 0.0013587223016656934, "train/perplexity": 10.430720182652664, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025741.7610994494, "perf/iters_per_sec": 0.9659489446160552, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352514028549193, "data/tokens_consumed": 24308088832, "data/tokens_consumed_B": 24.308088832, "train/loss_slope": 2.2584992988739583e-06} {"step": 11600, "timestamp": 1778338253.4829156, "grad/layer_0/attn": 0.0032185467425733805, "grad/layer_0/mlp": 0.003457313170656562, "grad/layer_0/attn_mlp_ratio": 0.9309387060438875, "grad/layer_4/attn": 0.002385372295975685, "grad/layer_4/mlp": 0.0025806112680584192, "grad/layer_4/attn_mlp_ratio": 0.9243438688601815, "grad/layer_8/attn": 0.0031739857513457537, "grad/layer_8/mlp": 0.003596074413508177, "grad/layer_8/attn_mlp_ratio": 0.8826251345524397, "grad/layer_12/attn": 0.004680385813117027, "grad/layer_12/mlp": 0.006426623091101646, "grad/layer_12/attn_mlp_ratio": 0.7282807275207184, "grad/layer_16/attn": 0.007464416325092316, "grad/layer_16/mlp": 0.004628610331565142, "grad/layer_16/attn_mlp_ratio": 1.6126689500996223, "grad/layer_20/attn": 0.003067415440455079, "grad/layer_20/mlp": 0.0058494615368545055, "grad/layer_20/attn_mlp_ratio": 0.5243927784958606, "grad/layer_24/attn": 0.011740284971892834, "grad/layer_24/mlp": 0.009477474726736546, "grad/layer_24/attn_mlp_ratio": 1.238756650534461, "grad/layer_27/attn": 0.006247072946280241, "grad/layer_27/mlp": 0.008868996985256672, "grad/layer_27/attn_mlp_ratio": 0.7043719697083933} {"step": 11600, "timestamp": 1778338253.498933, "train/loss": 2.3250694990158083, "train/z_loss": 0.0013614078401587904, "train/perplexity": 10.227390854690531, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025139.5117135446, "perf/iters_per_sec": 0.9656617697303508, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355592727661134, "data/tokens_consumed": 24329060352, "data/tokens_consumed_B": 24.329060352, "train/loss_slope": 3.8456867308434816e-06} {"step": 11610, "timestamp": 1778338263.8720498, "train/loss": 2.34710373878479, "train/z_loss": 0.0013529760995879768, "train/perplexity": 10.455244711592176, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023349.943414889, "perf/iters_per_sec": 0.9648084370684095, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364751815795898, "data/tokens_consumed": 24350031872, "data/tokens_consumed_B": 24.350031872, "train/loss_slope": 4.768414630426345e-06} {"step": 11620, "timestamp": 1778338274.2432065, "train/loss": 2.3500919580459594, "train/z_loss": 0.0013665998238138854, "train/perplexity": 10.486534001566058, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023846.9560402818, "perf/iters_per_sec": 0.9650454311562928, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036220645904541, "data/tokens_consumed": 24371003392, "data/tokens_consumed_B": 24.371003392, "train/loss_slope": 5.7208864387720365e-06} {"step": 11625, "timestamp": 1778338280.0222642, "eos/sharpness": 31.296563148498528, "eos/L0_probe": 2.309047222137451, "eos/L_plus": 2.472487449645996, "eos/L_minus": 2.4585726261138916, "eos/grad_norm": 0.13250547647476196, "eos/embed_grad_frac": 0.14776603877544403, "eos/time_s": 0.6136903762817383} {"step": 11625, "timestamp": 1778338281.4006798, "geo/rankme_last": 429.2699890136719, "geo/layer_0/stable_rank_q_proj": 20.658611297607422, "geo/layer_0/stable_rank_k_proj": 16.96184730529785, "geo/layer_0/stable_rank_o_proj": 44.12602233886719, "geo/layer_0/stable_rank_gate_proj": 125.93199157714844, "geo/layer_0/stable_rank_down_proj": 57.13943099975586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06586192548274994, "geo/layer_0/attn_entropy_mean": 6.237133026123047, "geo/layer_0/attn_entropy_std": 0.45149940252304077, "geo/layer_7/stable_rank_q_proj": 42.15434265136719, "geo/layer_7/stable_rank_k_proj": 38.799007415771484, "geo/layer_7/stable_rank_o_proj": 88.77298736572266, "geo/layer_7/stable_rank_gate_proj": 78.5339584350586, "geo/layer_7/stable_rank_down_proj": 144.37081909179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3948591649532318, "geo/layer_7/attn_entropy_mean": 4.728306770324707, "geo/layer_7/attn_entropy_std": 0.7616100311279297, "geo/layer_14/stable_rank_q_proj": 51.91586685180664, "geo/layer_14/stable_rank_k_proj": 42.8328971862793, "geo/layer_14/stable_rank_o_proj": 42.34199905395508, "geo/layer_14/stable_rank_gate_proj": 72.21046447753906, "geo/layer_14/stable_rank_down_proj": 127.18907928466797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37194153666496277, "geo/layer_14/attn_entropy_mean": 5.540549278259277, "geo/layer_14/attn_entropy_std": 0.49688518047332764, "geo/layer_21/stable_rank_q_proj": 38.41936492919922, "geo/layer_21/stable_rank_k_proj": 28.591691970825195, "geo/layer_21/stable_rank_o_proj": 65.71585083007812, "geo/layer_21/stable_rank_gate_proj": 60.538978576660156, "geo/layer_21/stable_rank_down_proj": 48.939659118652344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13391759991645813, "geo/layer_21/attn_entropy_mean": 5.852351665496826, "geo/layer_21/attn_entropy_std": 0.32172879576683044, "geo/layer_27/stable_rank_q_proj": 44.34720993041992, "geo/layer_27/stable_rank_k_proj": 30.24274444580078, "geo/layer_27/stable_rank_o_proj": 107.45783996582031, "geo/layer_27/stable_rank_gate_proj": 70.5206527709961, "geo/layer_27/stable_rank_down_proj": 130.2342529296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09728193283081055, "geo/layer_27/attn_entropy_mean": 4.31048583984375, "geo/layer_27/attn_entropy_std": 0.6843352317810059, "attnres/final_alpha/block_0": 0.26096218824386597, "attnres/block_norm/0": 1.7808008193969727, "attnres/final_alpha/block_1": 0.0038136665243655443, "attnres/block_norm/1": 50907.91796875, "attnres/final_alpha/block_2": 0.008524816483259201, "attnres/block_norm/2": 29952.697265625, "attnres/final_alpha/block_3": 0.010635435581207275, "attnres/block_norm/3": 72528.484375, "attnres/final_alpha/block_4": 0.011974301189184189, "attnres/block_norm/4": 17357.81640625, "attnres/final_alpha/block_5": 0.6061333417892456, "attnres/block_norm/5": 7212.419921875, "attnres/final_alpha/block_6": 0.09795629978179932, "attnres/block_norm/6": 48808.04296875, "geo/tier1_time_s": 1.3586599826812744, "geo/step": 11625.0, "geo/rankme_slope": 0.00038343241593512403} {"step": 11630, "timestamp": 1778338286.5843387, "train/loss": 2.3396265268325807, "train/z_loss": 0.0013649988686665893, "train/perplexity": 10.3773601733398, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700061.2993966856, "perf/iters_per_sec": 0.8106523987754276, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.233574342727661, "data/tokens_consumed": 24391974912, "data/tokens_consumed_B": 24.391974912, "train/loss_slope": 7.165512722460583e-06} {"step": 11640, "timestamp": 1778338296.9438927, "train/loss": 2.3220932483673096, "train/z_loss": 0.0013592260307632386, "train/perplexity": 10.19699682858279, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025833.5781054571, "perf/iters_per_sec": 0.9659927263762746, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352044820785522, "data/tokens_consumed": 24412946432, "data/tokens_consumed_B": 24.412946432, "train/loss_slope": 7.170675997615301e-06} {"step": 11650, "timestamp": 1778338307.2874346, "grad/layer_0/attn": 0.0027947458438575268, "grad/layer_0/mlp": 0.003208800218999386, "grad/layer_0/attn_mlp_ratio": 0.8709628415672261, "grad/layer_4/attn": 0.0024650415871292353, "grad/layer_4/mlp": 0.0026860819198191166, "grad/layer_4/attn_mlp_ratio": 0.9177089787061802, "grad/layer_8/attn": 0.0032517448998987675, "grad/layer_8/mlp": 0.0034323101863265038, "grad/layer_8/attn_mlp_ratio": 0.9473924641524791, "grad/layer_12/attn": 0.005421609617769718, "grad/layer_12/mlp": 0.006558937486261129, "grad/layer_12/attn_mlp_ratio": 0.8265987511645563, "grad/layer_16/attn": 0.003240448422729969, "grad/layer_16/mlp": 0.0042067626491189, "grad/layer_16/attn_mlp_ratio": 0.770295026361702, "grad/layer_20/attn": 0.0046143378131091595, "grad/layer_20/mlp": 0.005802982021123171, "grad/layer_20/attn_mlp_ratio": 0.7951666430804119, "grad/layer_24/attn": 0.006068812217563391, "grad/layer_24/mlp": 0.008239411748945713, "grad/layer_24/attn_mlp_ratio": 0.7365589108572034, "grad/layer_27/attn": 0.004806330427527428, "grad/layer_27/mlp": 0.007421300746500492, "grad/layer_27/attn_mlp_ratio": 0.6476398851009859} {"step": 11650, "timestamp": 1778338307.3038294, "train/loss": 2.3459712743759153, "train/z_loss": 0.0013594895135611296, "train/perplexity": 10.443411220838458, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025685.4526527748, "perf/iters_per_sec": 0.96592209465636, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035280179977417, "data/tokens_consumed": 24433917952, "data/tokens_consumed_B": 24.433917952, "train/loss_slope": 7.911178853252102e-06} {"step": 11660, "timestamp": 1778338317.6572654, "train/loss": 2.3447811365127564, "train/z_loss": 0.0013594920746982098, "train/perplexity": 10.43098951496145, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026632.1957459007, "perf/iters_per_sec": 0.9663735369424347, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347965478897094, "data/tokens_consumed": 24454889472, "data/tokens_consumed_B": 24.454889472, "train/loss_slope": 8.340871022908529e-06} {"step": 11670, "timestamp": 1778338328.567074, "train/loss": 2.3413386583328246, "train/z_loss": 0.0013631758280098437, "train/perplexity": 10.395142797333312, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1923635.5851779277, "perf/iters_per_sec": 0.9172609258546485, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0902023315429688, "data/tokens_consumed": 24475860992, "data/tokens_consumed_B": 24.475860992, "train/loss_slope": 9.786952382410141e-06} {"step": 11680, "timestamp": 1778338338.9325197, "train/loss": 2.350165772438049, "train/z_loss": 0.0013601187500171363, "train/perplexity": 10.487308087267493, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024795.7563455245, "perf/iters_per_sec": 0.965497854397547, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357350826263427, "data/tokens_consumed": 24496832512, "data/tokens_consumed_B": 24.496832512, "train/loss_slope": 1.4097390988908114e-05} {"step": 11690, "timestamp": 1778338349.287439, "train/loss": 2.3524839639663697, "train/z_loss": 0.0013571984018199146, "train/perplexity": 10.511647877277595, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026271.269566695, "perf/iters_per_sec": 0.9662014339288211, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034980869293213, "data/tokens_consumed": 24517804032, "data/tokens_consumed_B": 24.517804032, "train/loss_slope": 1.6248760114658903e-05} {"step": 11700, "timestamp": 1778338359.644155, "grad/layer_0/attn": 0.0031945127993822098, "grad/layer_0/mlp": 0.003364500356838107, "grad/layer_0/attn_mlp_ratio": 0.9494761080770818, "grad/layer_4/attn": 0.0023546733427792788, "grad/layer_4/mlp": 0.0026572244241833687, "grad/layer_4/attn_mlp_ratio": 0.8861401516316836, "grad/layer_8/attn": 0.005234390031546354, "grad/layer_8/mlp": 0.003705367911607027, "grad/layer_8/attn_mlp_ratio": 1.4126505154547893, "grad/layer_12/attn": 0.0075047919526696205, "grad/layer_12/mlp": 0.007317786570638418, "grad/layer_12/attn_mlp_ratio": 1.0255548966440269, "grad/layer_16/attn": 0.00448954151943326, "grad/layer_16/mlp": 0.004834093619138002, "grad/layer_16/attn_mlp_ratio": 0.9287245511313378, "grad/layer_20/attn": 0.002733581932261586, "grad/layer_20/mlp": 0.005795605480670929, "grad/layer_20/attn_mlp_ratio": 0.4716645903886946, "grad/layer_24/attn": 0.0070533412508666515, "grad/layer_24/mlp": 0.009168867953121662, "grad/layer_24/attn_mlp_ratio": 0.7692706678732549, "grad/layer_27/attn": 0.010437671095132828, "grad/layer_27/mlp": 0.007939113304018974, "grad/layer_27/attn_mlp_ratio": 1.3147149516530425} {"step": 11700, "timestamp": 1778338360.261417, "eos/sharpness": 8.556842803955076, "eos/L0_probe": 2.316495656967163, "eos/L_plus": 2.359954357147217, "eos/L_minus": 2.35860538482666, "eos/grad_norm": 0.09875450283288956, "eos/embed_grad_frac": 0.2624063491821289, "eos/time_s": 0.6143438816070557} {"step": 11700, "timestamp": 1778338360.2826245, "train/loss": 2.2991803646087647, "train/z_loss": 0.0013751184102147817, "train/perplexity": 9.966010611299684, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908893.4509974155, "perf/iters_per_sec": 0.9102313284861638, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0986218214035035, "data/tokens_consumed": 24538775552, "data/tokens_consumed_B": 24.538775552, "train/loss_slope": 1.3545561151535974e-05} {"step": 11700, "timestamp": 1778338361.6511452, "geo/rankme_last": 428.5133361816406, "geo/layer_0/stable_rank_q_proj": 20.655914306640625, "geo/layer_0/stable_rank_k_proj": 16.948183059692383, "geo/layer_0/stable_rank_o_proj": 44.1548957824707, "geo/layer_0/stable_rank_gate_proj": 126.28143310546875, "geo/layer_0/stable_rank_down_proj": 57.19142532348633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06811350584030151, "geo/layer_0/attn_entropy_mean": 6.2384033203125, "geo/layer_0/attn_entropy_std": 0.4535342752933502, "geo/layer_7/stable_rank_q_proj": 42.093910217285156, "geo/layer_7/stable_rank_k_proj": 38.85158157348633, "geo/layer_7/stable_rank_o_proj": 88.83010864257812, "geo/layer_7/stable_rank_gate_proj": 78.64805603027344, "geo/layer_7/stable_rank_down_proj": 144.44314575195312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3928476572036743, "geo/layer_7/attn_entropy_mean": 4.747992992401123, "geo/layer_7/attn_entropy_std": 0.7640926241874695, "geo/layer_14/stable_rank_q_proj": 51.86751937866211, "geo/layer_14/stable_rank_k_proj": 42.93522644042969, "geo/layer_14/stable_rank_o_proj": 42.24734115600586, "geo/layer_14/stable_rank_gate_proj": 72.24247741699219, "geo/layer_14/stable_rank_down_proj": 127.34984588623047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39686721563339233, "geo/layer_14/attn_entropy_mean": 5.534733295440674, "geo/layer_14/attn_entropy_std": 0.4698226749897003, "geo/layer_21/stable_rank_q_proj": 38.45731735229492, "geo/layer_21/stable_rank_k_proj": 28.554601669311523, "geo/layer_21/stable_rank_o_proj": 65.7528076171875, "geo/layer_21/stable_rank_gate_proj": 60.50178527832031, "geo/layer_21/stable_rank_down_proj": 48.99497985839844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1360228806734085, "geo/layer_21/attn_entropy_mean": 5.853395462036133, "geo/layer_21/attn_entropy_std": 0.33089974522590637, "geo/layer_27/stable_rank_q_proj": 44.43928146362305, "geo/layer_27/stable_rank_k_proj": 30.202369689941406, "geo/layer_27/stable_rank_o_proj": 107.38214874267578, "geo/layer_27/stable_rank_gate_proj": 70.40349578857422, "geo/layer_27/stable_rank_down_proj": 130.0408477783203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0967063456773758, "geo/layer_27/attn_entropy_mean": 4.301591873168945, "geo/layer_27/attn_entropy_std": 0.6888087391853333, "attnres/final_alpha/block_0": 0.2594432532787323, "attnres/block_norm/0": 1.7809679508209229, "attnres/final_alpha/block_1": 0.003819022560492158, "attnres/block_norm/1": 50751.6015625, "attnres/final_alpha/block_2": 0.0082938177511096, "attnres/block_norm/2": 30048.48046875, "attnres/final_alpha/block_3": 0.010358653962612152, "attnres/block_norm/3": 72902.546875, "attnres/final_alpha/block_4": 0.011718884110450745, "attnres/block_norm/4": 17473.73828125, "attnres/final_alpha/block_5": 0.6088003516197205, "attnres/block_norm/5": 7166.2685546875, "attnres/final_alpha/block_6": 0.09756602346897125, "attnres/block_norm/6": 49020.55859375, "geo/tier1_time_s": 1.3648250102996826, "geo/step": 11700.0, "geo/rankme_slope": 0.00034859555150185074} {"step": 11710, "timestamp": 1778338372.0127964, "train/loss": 2.379217505455017, "train/z_loss": 0.0013501192210242153, "train/perplexity": 10.79645139342718, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788349.1307330586, "perf/iters_per_sec": 0.8527513173737805, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.172674822807312, "data/tokens_consumed": 24559747072, "data/tokens_consumed_B": 24.559747072, "train/loss_slope": 1.3281893594251373e-05} {"step": 11720, "timestamp": 1778338382.3755808, "train/loss": 2.3327205181121826, "train/z_loss": 0.0013689203886315226, "train/perplexity": 10.305940928301876, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024815.5188838753, "perf/iters_per_sec": 0.9655072779101731, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035724973678589, "data/tokens_consumed": 24580718592, "data/tokens_consumed_B": 24.580718592, "train/loss_slope": 1.2941743030656814e-05} {"step": 11730, "timestamp": 1778338393.2216363, "train/loss": 2.3721957206726074, "train/z_loss": 0.0013575419434346259, "train/perplexity": 10.72090657545283, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934777.1060061436, "perf/iters_per_sec": 0.9225736169844359, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0839243412017823, "data/tokens_consumed": 24601690112, "data/tokens_consumed_B": 24.601690112, "train/loss_slope": 1.3934923169708003e-05} {"step": 11740, "timestamp": 1778338403.5793471, "train/loss": 2.361217665672302, "train/z_loss": 0.0013599597383290528, "train/perplexity": 10.603855545295463, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025971.645865744, "perf/iters_per_sec": 0.9660585622147293, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351339340209962, "data/tokens_consumed": 24622661632, "data/tokens_consumed_B": 24.622661632, "train/loss_slope": 1.4039608626046592e-05} {"step": 11750, "timestamp": 1778338413.9270296, "grad/layer_0/attn": 0.003616119036450982, "grad/layer_0/mlp": 0.0032885982654988766, "grad/layer_0/attn_mlp_ratio": 1.0995927852997096, "grad/layer_4/attn": 0.002081838669255376, "grad/layer_4/mlp": 0.002588108880445361, "grad/layer_4/attn_mlp_ratio": 0.8043860150344726, "grad/layer_8/attn": 0.004028715193271637, "grad/layer_8/mlp": 0.0034868319053202868, "grad/layer_8/attn_mlp_ratio": 1.1554084587741917, "grad/layer_12/attn": 0.006242531351745129, "grad/layer_12/mlp": 0.007697303779423237, "grad/layer_12/attn_mlp_ratio": 0.8110023262083925, "grad/layer_16/attn": 0.0048935189843177795, "grad/layer_16/mlp": 0.004922812804579735, "grad/layer_16/attn_mlp_ratio": 0.9940493533210042, "grad/layer_20/attn": 0.004776407964527607, "grad/layer_20/mlp": 0.006144978571683168, "grad/layer_20/attn_mlp_ratio": 0.777286337304618, "grad/layer_24/attn": 0.009374158456921577, "grad/layer_24/mlp": 0.010088716633617878, "grad/layer_24/attn_mlp_ratio": 0.9291725305047737, "grad/layer_27/attn": 0.006738219875842333, "grad/layer_27/mlp": 0.00983445718884468, "grad/layer_27/attn_mlp_ratio": 0.6851643845650294} {"step": 11750, "timestamp": 1778338413.9428873, "train/loss": 2.375109601020813, "train/z_loss": 0.0013469672878272832, "train/perplexity": 10.752191572671329, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025270.956493629, "perf/iters_per_sec": 0.9657244474857468, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354920625686646, "data/tokens_consumed": 24643633152, "data/tokens_consumed_B": 24.643633152, "train/loss_slope": 1.8386496991106845e-05} {"step": 11760, "timestamp": 1778338424.294511, "train/loss": 2.3373017072677613, "train/z_loss": 0.001348191185388714, "train/perplexity": 10.35326270536362, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026952.800015338, "perf/iters_per_sec": 0.9665264129711809, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346328735351562, "data/tokens_consumed": 24664604672, "data/tokens_consumed_B": 24.664604672, "train/loss_slope": 1.9312226260849308e-05} {"step": 11770, "timestamp": 1778338434.669525, "train/loss": 2.31487832069397, "train/z_loss": 0.0013511538505554198, "train/perplexity": 10.123691000099967, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023163.417205558, "perf/iters_per_sec": 0.9647194944408216, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365707397460937, "data/tokens_consumed": 24685576192, "data/tokens_consumed_B": 24.685576192, "train/loss_slope": 1.7743140614644324e-05} {"step": 11775, "timestamp": 1778338440.454061, "eos/sharpness": 16.47672653198242, "eos/L0_probe": 2.3132052421569824, "eos/L_plus": 2.3819875717163086, "eos/L_minus": 2.4091901779174805, "eos/grad_norm": 0.09684759378433228, "eos/embed_grad_frac": 0.2525470554828644, "eos/time_s": 0.6111955642700195} {"step": 11775, "timestamp": 1778338441.8343332, "geo/rankme_last": 428.67193603515625, "geo/layer_0/stable_rank_q_proj": 20.657682418823242, "geo/layer_0/stable_rank_k_proj": 16.903972625732422, "geo/layer_0/stable_rank_o_proj": 44.16746139526367, "geo/layer_0/stable_rank_gate_proj": 126.2845458984375, "geo/layer_0/stable_rank_down_proj": 57.238616943359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06157812848687172, "geo/layer_0/attn_entropy_mean": 6.237152099609375, "geo/layer_0/attn_entropy_std": 0.4589632749557495, "geo/layer_7/stable_rank_q_proj": 42.0693473815918, "geo/layer_7/stable_rank_k_proj": 38.82747268676758, "geo/layer_7/stable_rank_o_proj": 88.87809753417969, "geo/layer_7/stable_rank_gate_proj": 78.67943572998047, "geo/layer_7/stable_rank_down_proj": 144.23052978515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40958818793296814, "geo/layer_7/attn_entropy_mean": 4.723336219787598, "geo/layer_7/attn_entropy_std": 0.7740634679794312, "geo/layer_14/stable_rank_q_proj": 51.920135498046875, "geo/layer_14/stable_rank_k_proj": 43.00637435913086, "geo/layer_14/stable_rank_o_proj": 42.24012756347656, "geo/layer_14/stable_rank_gate_proj": 72.25951385498047, "geo/layer_14/stable_rank_down_proj": 127.24088287353516, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3724230229854584, "geo/layer_14/attn_entropy_mean": 5.530364513397217, "geo/layer_14/attn_entropy_std": 0.4756929576396942, "geo/layer_21/stable_rank_q_proj": 38.38837432861328, "geo/layer_21/stable_rank_k_proj": 28.598342895507812, "geo/layer_21/stable_rank_o_proj": 65.5533218383789, "geo/layer_21/stable_rank_gate_proj": 60.47212600708008, "geo/layer_21/stable_rank_down_proj": 49.0218505859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13801072537899017, "geo/layer_21/attn_entropy_mean": 5.855547904968262, "geo/layer_21/attn_entropy_std": 0.3315270245075226, "geo/layer_27/stable_rank_q_proj": 44.47959899902344, "geo/layer_27/stable_rank_k_proj": 30.274452209472656, "geo/layer_27/stable_rank_o_proj": 107.31158447265625, "geo/layer_27/stable_rank_gate_proj": 70.3691635131836, "geo/layer_27/stable_rank_down_proj": 129.9704132080078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11359073966741562, "geo/layer_27/attn_entropy_mean": 4.2888641357421875, "geo/layer_27/attn_entropy_std": 0.696965754032135, "attnres/final_alpha/block_0": 0.26127171516418457, "attnres/block_norm/0": 1.7809745073318481, "attnres/final_alpha/block_1": 0.0038175429217517376, "attnres/block_norm/1": 50626.15625, "attnres/final_alpha/block_2": 0.008409107103943825, "attnres/block_norm/2": 29836.42578125, "attnres/final_alpha/block_3": 0.010598940774798393, "attnres/block_norm/3": 72067.3125, "attnres/final_alpha/block_4": 0.011779431253671646, "attnres/block_norm/4": 17455.35546875, "attnres/final_alpha/block_5": 0.6042216420173645, "attnres/block_norm/5": 7233.3798828125, "attnres/final_alpha/block_6": 0.09990163147449493, "attnres/block_norm/6": 48582.46875, "geo/tier1_time_s": 1.3607077598571777, "geo/step": 11775.0, "geo/rankme_slope": 0.00030344723827030814} {"step": 11780, "timestamp": 1778338447.023269, "train/loss": 2.3283533573150637, "train/z_loss": 0.0013570695300586522, "train/perplexity": 10.261031362127376, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698951.7567330778, "perf/iters_per_sec": 0.8101233276048078, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2343799591064453, "data/tokens_consumed": 24706547712, "data/tokens_consumed_B": 24.706547712, "train/loss_slope": 1.7461609504189244e-05} {"step": 11790, "timestamp": 1778338457.3947794, "train/loss": 2.2842522859573364, "train/z_loss": 0.0013774816412478685, "train/perplexity": 9.818342166418518, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023391.1346013593, "perf/iters_per_sec": 0.9648280785567089, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364540815353394, "data/tokens_consumed": 24727519232, "data/tokens_consumed_B": 24.727519232, "train/loss_slope": 1.564313838190765e-05} {"step": 11800, "timestamp": 1778338467.7516093, "grad/layer_0/attn": 0.002764755394309759, "grad/layer_0/mlp": 0.003182274056598544, "grad/layer_0/attn_mlp_ratio": 0.8687986195585798, "grad/layer_4/attn": 0.002517870394513011, "grad/layer_4/mlp": 0.0025282511487603188, "grad/layer_4/attn_mlp_ratio": 0.9958940575022368, "grad/layer_8/attn": 0.004151872359216213, "grad/layer_8/mlp": 0.003436593571677804, "grad/layer_8/attn_mlp_ratio": 1.2081359496856658, "grad/layer_12/attn": 0.007816992700099945, "grad/layer_12/mlp": 0.006499588955193758, "grad/layer_12/attn_mlp_ratio": 1.2026902983740893, "grad/layer_16/attn": 0.003483323846012354, "grad/layer_16/mlp": 0.0044373380951583385, "grad/layer_16/attn_mlp_ratio": 0.7850030114479606, "grad/layer_20/attn": 0.002993810223415494, "grad/layer_20/mlp": 0.005585554521530867, "grad/layer_20/attn_mlp_ratio": 0.5359915758186539, "grad/layer_24/attn": 0.00629589706659317, "grad/layer_24/mlp": 0.008409857749938965, "grad/layer_24/attn_mlp_ratio": 0.748632994627711, "grad/layer_27/attn": 0.004470232408493757, "grad/layer_27/mlp": 0.008076769299805164, "grad/layer_27/attn_mlp_ratio": 0.5534678764756603} {"step": 11800, "timestamp": 1778338467.7679985, "train/loss": 2.3295177459716796, "train/z_loss": 0.001356867398135364, "train/perplexity": 10.272986149309219, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023027.3138706335, "perf/iters_per_sec": 0.9646545953133743, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036640477180481, "data/tokens_consumed": 24748490752, "data/tokens_consumed_B": 24.748490752, "train/loss_slope": 1.547065586647471e-05} {"step": 11810, "timestamp": 1778338478.1476493, "train/loss": 2.3459481477737425, "train/z_loss": 0.0013484056340530514, "train/perplexity": 10.443169703014581, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021882.0728393472, "perf/iters_per_sec": 0.9641085018345581, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372276544570922, "data/tokens_consumed": 24769462272, "data/tokens_consumed_B": 24.769462272, "train/loss_slope": 1.4376025696327245e-05} {"step": 11820, "timestamp": 1778338488.5110197, "train/loss": 2.3426941871643066, "train/z_loss": 0.0013576923636719584, "train/perplexity": 10.40924326774047, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025107.5273776862, "perf/iters_per_sec": 0.9656465184105331, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355756282806396, "data/tokens_consumed": 24790433792, "data/tokens_consumed_B": 24.790433792, "train/loss_slope": 1.3570910461521913e-05} {"step": 11830, "timestamp": 1778338498.884957, "train/loss": 2.3697871923446656, "train/z_loss": 0.0013556041638366878, "train/perplexity": 10.695116039350625, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022507.2188920595, "perf/iters_per_sec": 0.9644065947017953, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369070529937745, "data/tokens_consumed": 24811405312, "data/tokens_consumed_B": 24.811405312, "train/loss_slope": 1.3658470382141068e-05} {"step": 11840, "timestamp": 1778338509.2495108, "train/loss": 2.370270848274231, "train/z_loss": 0.0013634836068376899, "train/perplexity": 10.700290046759267, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024782.7058241444, "perf/iters_per_sec": 0.9654916314240191, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357417583465576, "data/tokens_consumed": 24832376832, "data/tokens_consumed_B": 24.832376832, "train/loss_slope": 1.590677997281235e-05} {"step": 11850, "timestamp": 1778338519.6025183, "grad/layer_0/attn": 0.0029038372449576855, "grad/layer_0/mlp": 0.0031906820368021727, "grad/layer_0/attn_mlp_ratio": 0.9100991952360461, "grad/layer_4/attn": 0.002005830639973283, "grad/layer_4/mlp": 0.002777533372864127, "grad/layer_4/attn_mlp_ratio": 0.722162544419282, "grad/layer_8/attn": 0.003613156732171774, "grad/layer_8/mlp": 0.0035579863470047712, "grad/layer_8/attn_mlp_ratio": 1.0155060414053705, "grad/layer_12/attn": 0.009565444663167, "grad/layer_12/mlp": 0.007412598934024572, "grad/layer_12/attn_mlp_ratio": 1.2904306059535462, "grad/layer_16/attn": 0.0036936046089977026, "grad/layer_16/mlp": 0.0043870266526937485, "grad/layer_16/attn_mlp_ratio": 0.8419380179821638, "grad/layer_20/attn": 0.0030221138149499893, "grad/layer_20/mlp": 0.005327223800122738, "grad/layer_20/attn_mlp_ratio": 0.5672961887110397, "grad/layer_24/attn": 0.008333121426403522, "grad/layer_24/mlp": 0.009903495199978352, "grad/layer_24/attn_mlp_ratio": 0.8414323603931774, "grad/layer_27/attn": 0.007469660602509975, "grad/layer_27/mlp": 0.009703745134174824, "grad/layer_27/attn_mlp_ratio": 0.7697708897182494} {"step": 11850, "timestamp": 1778338520.2136207, "eos/sharpness": 45.57957649230956, "eos/L0_probe": 2.3158912658691406, "eos/L_plus": 2.563359022140503, "eos/L_minus": 2.524219274520874, "eos/grad_norm": 0.13796256482601166, "eos/embed_grad_frac": 0.12013135105371475, "eos/time_s": 0.608389139175415} {"step": 11850, "timestamp": 1778338520.2335026, "train/loss": 2.302137303352356, "train/z_loss": 0.0013693133485503494, "train/perplexity": 9.995523106011285, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910454.168362966, "perf/iters_per_sec": 0.9109755365195112, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0977243185043335, "data/tokens_consumed": 24853348352, "data/tokens_consumed_B": 24.853348352, "train/loss_slope": 1.1553490701967835e-05} {"step": 11850, "timestamp": 1778338521.6014774, "geo/rankme_last": 429.4604187011719, "geo/layer_0/stable_rank_q_proj": 20.64957046508789, "geo/layer_0/stable_rank_k_proj": 16.893726348876953, "geo/layer_0/stable_rank_o_proj": 44.15107727050781, "geo/layer_0/stable_rank_gate_proj": 125.78965759277344, "geo/layer_0/stable_rank_down_proj": 57.33638000488281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06409155577421188, "geo/layer_0/attn_entropy_mean": 6.238001823425293, "geo/layer_0/attn_entropy_std": 0.45555245876312256, "geo/layer_7/stable_rank_q_proj": 42.02920913696289, "geo/layer_7/stable_rank_k_proj": 38.85480880737305, "geo/layer_7/stable_rank_o_proj": 88.89606475830078, "geo/layer_7/stable_rank_gate_proj": 78.69925689697266, "geo/layer_7/stable_rank_down_proj": 144.461669921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3966122567653656, "geo/layer_7/attn_entropy_mean": 4.745140552520752, "geo/layer_7/attn_entropy_std": 0.7641287446022034, "geo/layer_14/stable_rank_q_proj": 51.798213958740234, "geo/layer_14/stable_rank_k_proj": 42.82604217529297, "geo/layer_14/stable_rank_o_proj": 42.288150787353516, "geo/layer_14/stable_rank_gate_proj": 72.19092559814453, "geo/layer_14/stable_rank_down_proj": 127.48680877685547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3753705620765686, "geo/layer_14/attn_entropy_mean": 5.537398815155029, "geo/layer_14/attn_entropy_std": 0.47776293754577637, "geo/layer_21/stable_rank_q_proj": 38.41431427001953, "geo/layer_21/stable_rank_k_proj": 28.561450958251953, "geo/layer_21/stable_rank_o_proj": 65.4912338256836, "geo/layer_21/stable_rank_gate_proj": 60.449092864990234, "geo/layer_21/stable_rank_down_proj": 48.98652267456055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14059026539325714, "geo/layer_21/attn_entropy_mean": 5.848578453063965, "geo/layer_21/attn_entropy_std": 0.3311781585216522, "geo/layer_27/stable_rank_q_proj": 44.5599365234375, "geo/layer_27/stable_rank_k_proj": 30.30078887939453, "geo/layer_27/stable_rank_o_proj": 107.34627532958984, "geo/layer_27/stable_rank_gate_proj": 70.33164978027344, "geo/layer_27/stable_rank_down_proj": 130.16085815429688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10060624778270721, "geo/layer_27/attn_entropy_mean": 4.313803672790527, "geo/layer_27/attn_entropy_std": 0.6785328984260559, "attnres/final_alpha/block_0": 0.2599480152130127, "attnres/block_norm/0": 1.7810719013214111, "attnres/final_alpha/block_1": 0.0038430867716670036, "attnres/block_norm/1": 50591.80859375, "attnres/final_alpha/block_2": 0.008300626650452614, "attnres/block_norm/2": 29905.509765625, "attnres/final_alpha/block_3": 0.010486839339137077, "attnres/block_norm/3": 72368.6015625, "attnres/final_alpha/block_4": 0.011882432736456394, "attnres/block_norm/4": 17480.28125, "attnres/final_alpha/block_5": 0.6064017415046692, "attnres/block_norm/5": 7228.14404296875, "attnres/final_alpha/block_6": 0.09913724660873413, "attnres/block_norm/6": 48598.68359375, "geo/tier1_time_s": 1.363983392715454, "geo/step": 11850.0, "geo/rankme_slope": 0.00031487131962159865} {"step": 11860, "timestamp": 1778338531.9608843, "train/loss": 2.299317717552185, "train/z_loss": 0.0013599137193523348, "train/perplexity": 9.967379566204144, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788851.429245451, "perf/iters_per_sec": 0.8529908319689994, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1723455429077148, "data/tokens_consumed": 24874319872, "data/tokens_consumed_B": 24.874319872, "train/loss_slope": 9.033104095092729e-06} {"step": 11870, "timestamp": 1778338542.3174465, "train/loss": 2.3218449115753175, "train/z_loss": 0.001365500153042376, "train/perplexity": 10.194464853506723, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026415.886040515, "perf/iters_per_sec": 0.9662703924372268, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349070072174071, "data/tokens_consumed": 24895291392, "data/tokens_consumed_B": 24.895291392, "train/loss_slope": 8.237971996281056e-06} {"step": 11880, "timestamp": 1778338552.6769676, "train/loss": 2.3382360219955443, "train/z_loss": 0.0013721114373765886, "train/perplexity": 10.362940431506873, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025767.0472696319, "perf/iters_per_sec": 0.9659610020015869, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352384805679322, "data/tokens_consumed": 24916262912, "data/tokens_consumed_B": 24.916262912, "train/loss_slope": 8.280572324696142e-06} {"step": 11890, "timestamp": 1778338563.0376263, "train/loss": 2.3524089813232423, "train/z_loss": 0.0013540992164053022, "train/perplexity": 10.51085971568572, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025580.1218098602, "perf/iters_per_sec": 0.9658718689965535, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035334014892578, "data/tokens_consumed": 24937234432, "data/tokens_consumed_B": 24.937234432, "train/loss_slope": 8.501807669780657e-06} {"step": 11900, "timestamp": 1778338573.406306, "grad/layer_0/attn": 0.002937645185738802, "grad/layer_0/mlp": 0.0032079315278679132, "grad/layer_0/attn_mlp_ratio": 0.9157443257889035, "grad/layer_4/attn": 0.0021861260756850243, "grad/layer_4/mlp": 0.0025966926477849483, "grad/layer_4/attn_mlp_ratio": 0.8418886206501909, "grad/layer_8/attn": 0.004204238299280405, "grad/layer_8/mlp": 0.00342686683870852, "grad/layer_8/attn_mlp_ratio": 1.2268460884170929, "grad/layer_12/attn": 0.006105342879891396, "grad/layer_12/mlp": 0.007334548979997635, "grad/layer_12/attn_mlp_ratio": 0.8324087565984853, "grad/layer_16/attn": 0.005238573998212814, "grad/layer_16/mlp": 0.004677505698055029, "grad/layer_16/attn_mlp_ratio": 1.1199502949608497, "grad/layer_20/attn": 0.0031101773492991924, "grad/layer_20/mlp": 0.006330535281449556, "grad/layer_20/attn_mlp_ratio": 0.4912976805110327, "grad/layer_24/attn": 0.008810671977698803, "grad/layer_24/mlp": 0.0103091299533844, "grad/layer_24/attn_mlp_ratio": 0.8546474757883509, "grad/layer_27/attn": 0.007203822024166584, "grad/layer_27/mlp": 0.010285761207342148, "grad/layer_27/attn_mlp_ratio": 0.7003683839157706} {"step": 11900, "timestamp": 1778338573.4219468, "train/loss": 2.3537084102630614, "train/z_loss": 0.0013512238278053702, "train/perplexity": 10.524526708702707, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021003.3753358768, "perf/iters_per_sec": 0.9636895062140831, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376786231994628, "data/tokens_consumed": 24958205952, "data/tokens_consumed_B": 24.958205952, "train/loss_slope": 9.010147984021418e-06} {"step": 11910, "timestamp": 1778338583.7827547, "train/loss": 2.3700612068176268, "train/z_loss": 0.0013602565857581794, "train/perplexity": 10.698047057487761, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025816.735033343, "perf/iters_per_sec": 0.9659846949736324, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035213088989258, "data/tokens_consumed": 24979177472, "data/tokens_consumed_B": 24.979177472, "train/loss_slope": 1.2993896511843557e-05} {"step": 11920, "timestamp": 1778338594.1317677, "train/loss": 2.3175182819366453, "train/z_loss": 0.0013661734410561621, "train/perplexity": 10.150452461040295, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027864.0752042823, "perf/iters_per_sec": 0.9669609428426181, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341679334640503, "data/tokens_consumed": 25000148992, "data/tokens_consumed_B": 25.000148992, "train/loss_slope": 1.0712346735924786e-05} {"step": 11925, "timestamp": 1778338599.9089248, "eos/sharpness": 55.55624961853026, "eos/L0_probe": 2.318488836288452, "eos/L_plus": 2.6472387313842773, "eos/L_minus": 2.5453014373779297, "eos/grad_norm": 0.18395458161830902, "eos/embed_grad_frac": 0.07205990701913834, "eos/time_s": 0.6096620559692383} {"step": 11925, "timestamp": 1778338601.289021, "geo/rankme_last": 429.3719482421875, "geo/layer_0/stable_rank_q_proj": 20.65127182006836, "geo/layer_0/stable_rank_k_proj": 16.886323928833008, "geo/layer_0/stable_rank_o_proj": 44.12752151489258, "geo/layer_0/stable_rank_gate_proj": 125.61642456054688, "geo/layer_0/stable_rank_down_proj": 57.38039016723633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06557094305753708, "geo/layer_0/attn_entropy_mean": 6.235714912414551, "geo/layer_0/attn_entropy_std": 0.4594644606113434, "geo/layer_7/stable_rank_q_proj": 41.9556999206543, "geo/layer_7/stable_rank_k_proj": 38.90361022949219, "geo/layer_7/stable_rank_o_proj": 88.82327270507812, "geo/layer_7/stable_rank_gate_proj": 78.6449203491211, "geo/layer_7/stable_rank_down_proj": 144.39358520507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38755252957344055, "geo/layer_7/attn_entropy_mean": 4.7454633712768555, "geo/layer_7/attn_entropy_std": 0.7554291486740112, "geo/layer_14/stable_rank_q_proj": 51.955284118652344, "geo/layer_14/stable_rank_k_proj": 42.9305419921875, "geo/layer_14/stable_rank_o_proj": 42.25474548339844, "geo/layer_14/stable_rank_gate_proj": 72.17556762695312, "geo/layer_14/stable_rank_down_proj": 127.5438461303711, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38886788487434387, "geo/layer_14/attn_entropy_mean": 5.542513847351074, "geo/layer_14/attn_entropy_std": 0.48003777861595154, "geo/layer_21/stable_rank_q_proj": 38.396270751953125, "geo/layer_21/stable_rank_k_proj": 28.55295753479004, "geo/layer_21/stable_rank_o_proj": 65.49625396728516, "geo/layer_21/stable_rank_gate_proj": 60.31689453125, "geo/layer_21/stable_rank_down_proj": 48.96111297607422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13577499985694885, "geo/layer_21/attn_entropy_mean": 5.855978012084961, "geo/layer_21/attn_entropy_std": 0.3327924609184265, "geo/layer_27/stable_rank_q_proj": 44.58447265625, "geo/layer_27/stable_rank_k_proj": 30.3016414642334, "geo/layer_27/stable_rank_o_proj": 107.43406677246094, "geo/layer_27/stable_rank_gate_proj": 70.34911346435547, "geo/layer_27/stable_rank_down_proj": 130.20233154296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09973713010549545, "geo/layer_27/attn_entropy_mean": 4.319428443908691, "geo/layer_27/attn_entropy_std": 0.6902410387992859, "attnres/final_alpha/block_0": 0.2590203285217285, "attnres/block_norm/0": 1.7810672521591187, "attnres/final_alpha/block_1": 0.003767637303099036, "attnres/block_norm/1": 50822.60546875, "attnres/final_alpha/block_2": 0.008258269168436527, "attnres/block_norm/2": 30035.6796875, "attnres/final_alpha/block_3": 0.01041174866259098, "attnres/block_norm/3": 72594.7890625, "attnres/final_alpha/block_4": 0.011570426635444164, "attnres/block_norm/4": 17450.765625, "attnres/final_alpha/block_5": 0.609432578086853, "attnres/block_norm/5": 7184.43603515625, "attnres/final_alpha/block_6": 0.09753899276256561, "attnres/block_norm/6": 49031.828125, "geo/tier1_time_s": 1.359083652496338, "geo/step": 11925.0, "geo/rankme_slope": 0.000287139015762555} {"step": 11930, "timestamp": 1778338606.4771452, "train/loss": 2.2738842487335207, "train/z_loss": 0.0013707983656786383, "train/perplexity": 9.71707112749754, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699500.1080740027, "perf/iters_per_sec": 0.8103848018999112, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.233981680870056, "data/tokens_consumed": 25021120512, "data/tokens_consumed_B": 25.021120512, "train/loss_slope": 7.268796137349922e-06} {"step": 11940, "timestamp": 1778338616.841287, "train/loss": 2.290970873832703, "train/z_loss": 0.0013692157459445297, "train/perplexity": 9.884529655312306, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024845.8159770707, "perf/iters_per_sec": 0.9655217246899942, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357094764709474, "data/tokens_consumed": 25042092032, "data/tokens_consumed_B": 25.042092032, "train/loss_slope": 3.413639317537389e-06} {"step": 11950, "timestamp": 1778338627.1998007, "grad/layer_0/attn": 0.0026009499561041594, "grad/layer_0/mlp": 0.003226465778425336, "grad/layer_0/attn_mlp_ratio": 0.8061296954962818, "grad/layer_4/attn": 0.00247405469417572, "grad/layer_4/mlp": 0.002548952354118228, "grad/layer_4/attn_mlp_ratio": 0.9706162585255371, "grad/layer_8/attn": 0.004051243886351585, "grad/layer_8/mlp": 0.0033419018145650625, "grad/layer_8/attn_mlp_ratio": 1.212256969211151, "grad/layer_12/attn": 0.008299075067043304, "grad/layer_12/mlp": 0.0069327587261796, "grad/layer_12/attn_mlp_ratio": 1.1970811728952981, "grad/layer_16/attn": 0.0035137145314365625, "grad/layer_16/mlp": 0.004641728941351175, "grad/layer_16/attn_mlp_ratio": 0.7569839816444229, "grad/layer_20/attn": 0.0025532138533890247, "grad/layer_20/mlp": 0.006117099430412054, "grad/layer_20/attn_mlp_ratio": 0.4173896207990975, "grad/layer_24/attn": 0.007602672558277845, "grad/layer_24/mlp": 0.010190075263381004, "grad/layer_24/attn_mlp_ratio": 0.7460859990887571, "grad/layer_27/attn": 0.005575200077146292, "grad/layer_27/mlp": 0.011372590437531471, "grad/layer_27/attn_mlp_ratio": 0.49023132053749663} {"step": 11950, "timestamp": 1778338627.2149227, "train/loss": 2.3371817827224732, "train/z_loss": 0.001353712915442884, "train/perplexity": 10.352021169488234, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023320.482272981, "perf/iters_per_sec": 0.9647943889012246, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036490273475647, "data/tokens_consumed": 25063063552, "data/tokens_consumed_B": 25.063063552, "train/loss_slope": 3.618275521933887e-06} {"step": 11960, "timestamp": 1778338637.5975566, "train/loss": 2.3287844181060793, "train/z_loss": 0.0013625952298752964, "train/perplexity": 10.265455443878556, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020779.1197849414, "perf/iters_per_sec": 0.9635825728344638, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377937793731689, "data/tokens_consumed": 25084035072, "data/tokens_consumed_B": 25.084035072, "train/loss_slope": 2.0913899928429554e-06} {"step": 11970, "timestamp": 1778338647.9600446, "train/loss": 2.3131396055221556, "train/z_loss": 0.0013675908092409372, "train/perplexity": 10.10610407871748, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025512.7680582134, "perf/iters_per_sec": 0.9658397522250239, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353684425354004, "data/tokens_consumed": 25105006592, "data/tokens_consumed_B": 25.105006592, "train/loss_slope": 2.8199734789381485e-06} {"step": 11980, "timestamp": 1778338658.3215127, "train/loss": 2.3469294786453245, "train/z_loss": 0.0013540467014536261, "train/perplexity": 10.453422937926465, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025532.5445955559, "perf/iters_per_sec": 0.9658491824128894, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353583335876464, "data/tokens_consumed": 25125978112, "data/tokens_consumed_B": 25.125978112, "train/loss_slope": 3.37538609016844e-06} {"step": 11990, "timestamp": 1778338668.697359, "train/loss": 2.3356899976730348, "train/z_loss": 0.0013559987768530847, "train/perplexity": 10.3365896921615, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022242.5062483437, "perf/iters_per_sec": 0.9642803698770255, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037042784690857, "data/tokens_consumed": 25146949632, "data/tokens_consumed_B": 25.146949632, "train/loss_slope": 2.800932396935356e-06} {"step": 12000, "timestamp": 1778338679.0480769, "grad/layer_0/attn": 0.0036099767312407494, "grad/layer_0/mlp": 0.0037929750978946686, "grad/layer_0/attn_mlp_ratio": 0.9517533184093863, "grad/layer_4/attn": 0.0020407226402312517, "grad/layer_4/mlp": 0.002658877754583955, "grad/layer_4/attn_mlp_ratio": 0.7675127447893133, "grad/layer_8/attn": 0.0066794524900615215, "grad/layer_8/mlp": 0.003449698444455862, "grad/layer_8/attn_mlp_ratio": 1.9362423713214931, "grad/layer_12/attn": 0.009512818418443203, "grad/layer_12/mlp": 0.0070340451784431934, "grad/layer_12/attn_mlp_ratio": 1.3523965288646285, "grad/layer_16/attn": 0.005297727882862091, "grad/layer_16/mlp": 0.005034841597080231, "grad/layer_16/attn_mlp_ratio": 1.0522133964875822, "grad/layer_20/attn": 0.003576657036319375, "grad/layer_20/mlp": 0.007201463915407658, "grad/layer_20/attn_mlp_ratio": 0.49665693373834197, "grad/layer_24/attn": 0.014135267585515976, "grad/layer_24/mlp": 0.014250986278057098, "grad/layer_24/attn_mlp_ratio": 0.991879945045818, "grad/layer_27/attn": 0.015464442782104015, "grad/layer_27/mlp": 0.013520236127078533, "grad/layer_27/attn_mlp_ratio": 1.1437997474579324} {"step": 12000, "timestamp": 1778338679.630684, "eos/sharpness": 70.44510841369627, "eos/L0_probe": 2.312647581100464, "eos/L_plus": 2.618300199508667, "eos/L_minus": 2.7114460468292236, "eos/grad_norm": 0.2688722312450409, "eos/embed_grad_frac": 0.036898184567689896, "eos/time_s": 0.5796630382537842} {"step": 12000, "timestamp": 1778338679.6512642, "train/loss": 2.3553853034973145, "train/z_loss": 0.001366478472482413, "train/perplexity": 10.542190021940417, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915694.0249551572, "perf/iters_per_sec": 0.9134740948463236, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.094721794128418, "data/tokens_consumed": 25167921152, "data/tokens_consumed_B": 25.167921152, "train/loss_slope": 3.13720520001218e-06} {"step": 12000, "timestamp": 1778338681.0200636, "geo/rankme_last": 430.1318054199219, "geo/layer_0/stable_rank_q_proj": 20.655969619750977, "geo/layer_0/stable_rank_k_proj": 16.930452346801758, "geo/layer_0/stable_rank_o_proj": 44.07441711425781, "geo/layer_0/stable_rank_gate_proj": 125.71308898925781, "geo/layer_0/stable_rank_down_proj": 57.43476486206055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0635296031832695, "geo/layer_0/attn_entropy_mean": 6.235506057739258, "geo/layer_0/attn_entropy_std": 0.45649218559265137, "geo/layer_7/stable_rank_q_proj": 41.99247741699219, "geo/layer_7/stable_rank_k_proj": 38.88335037231445, "geo/layer_7/stable_rank_o_proj": 88.82421875, "geo/layer_7/stable_rank_gate_proj": 78.74083709716797, "geo/layer_7/stable_rank_down_proj": 144.69981384277344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4058155119419098, "geo/layer_7/attn_entropy_mean": 4.7450737953186035, "geo/layer_7/attn_entropy_std": 0.7763100862503052, "geo/layer_14/stable_rank_q_proj": 51.94684982299805, "geo/layer_14/stable_rank_k_proj": 42.949127197265625, "geo/layer_14/stable_rank_o_proj": 42.22975540161133, "geo/layer_14/stable_rank_gate_proj": 72.04190063476562, "geo/layer_14/stable_rank_down_proj": 127.54914093017578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3758629858493805, "geo/layer_14/attn_entropy_mean": 5.497166156768799, "geo/layer_14/attn_entropy_std": 0.48974698781967163, "geo/layer_21/stable_rank_q_proj": 38.37952423095703, "geo/layer_21/stable_rank_k_proj": 28.540456771850586, "geo/layer_21/stable_rank_o_proj": 65.34456634521484, "geo/layer_21/stable_rank_gate_proj": 60.20383071899414, "geo/layer_21/stable_rank_down_proj": 48.95383071899414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13299459218978882, "geo/layer_21/attn_entropy_mean": 5.829120635986328, "geo/layer_21/attn_entropy_std": 0.34770092368125916, "geo/layer_27/stable_rank_q_proj": 44.53440856933594, "geo/layer_27/stable_rank_k_proj": 30.32868766784668, "geo/layer_27/stable_rank_o_proj": 107.54127502441406, "geo/layer_27/stable_rank_gate_proj": 70.34516143798828, "geo/layer_27/stable_rank_down_proj": 130.3626708984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09532377123832703, "geo/layer_27/attn_entropy_mean": 4.309869766235352, "geo/layer_27/attn_entropy_std": 0.696057140827179, "attnres/final_alpha/block_0": 0.26098930835723877, "attnres/block_norm/0": 1.781126618385315, "attnres/final_alpha/block_1": 0.003857541363686323, "attnres/block_norm/1": 50743.9765625, "attnres/final_alpha/block_2": 0.00841003842651844, "attnres/block_norm/2": 29956.14453125, "attnres/final_alpha/block_3": 0.010510736145079136, "attnres/block_norm/3": 72436.8515625, "attnres/final_alpha/block_4": 0.011789508163928986, "attnres/block_norm/4": 17418.1796875, "attnres/final_alpha/block_5": 0.6037007570266724, "attnres/block_norm/5": 7215.9248046875, "attnres/final_alpha/block_6": 0.10074213147163391, "attnres/block_norm/6": 48480.09765625, "geo/tier1_time_s": 1.3654391765594482, "geo/step": 12000.0, "geo/rankme_slope": 0.0002761455363395358} {"step": 12000, "timestamp": 1778338687.9924908, "geo/ww_alpha_mean": 7.929123587967027, "geo/ww_alpha_std": 5.822983600500456, "geo/ww_alpha_min": 1.3381342041681545, "geo/ww_alpha_max": 53.80076113077609, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.955878187826034, "geo/ww_alpha_by_type/k_proj": 4.418154920999145, "geo/ww_alpha_by_type/v_proj": 11.01997471494235, "geo/ww_alpha_by_type/o_proj": 8.192095000307793, "geo/ww_alpha_by_type/gate_proj": 7.778681279369091, "geo/ww_alpha_by_type/up_proj": 12.069430775280647, "geo/ww_alpha_by_type/down_proj": 8.175211360169572, "geo/twonn_id/layer_0": 0.7324497103691101, "geo/twonn_id/layer_7": 3.852822780609131, "geo/twonn_id/layer_14": 4.8274641036987305, "geo/twonn_id/layer_21": 7.963314533233643, "geo/twonn_id/layer_27": 6.605020999908447, "geo/tier2_time_s": 6.966041564941406} {"step": 12000, "timestamp": 1778338688.9340055, "eoc/jacobian_sigma/layer_0/attn": 1472.63720703125, "eoc/jacobian_sigma/layer_0/mlp": 10572.6376953125, "eoc/jacobian_sigma/layer_0": 10572.6376953125, "eoc/jacobian_sigma/layer_7/attn": 1.148633599281311, "eoc/jacobian_sigma/layer_7/mlp": 1.8443704843521118, "eoc/jacobian_sigma/layer_7": 1.8443704843521118, "eoc/jacobian_sigma/layer_14/attn": 1.9271618127822876, "eoc/jacobian_sigma/layer_14/mlp": 13.273945808410645, "eoc/jacobian_sigma/layer_14": 13.273945808410645, "eoc/jacobian_sigma/layer_21/attn": 1.0983246564865112, "eoc/jacobian_sigma/layer_21/mlp": 5.323849201202393, "eoc/jacobian_sigma/layer_21": 5.323849201202393, "eoc/jacobian_sigma/layer_27/attn": 3.9700629711151123, "eoc/jacobian_sigma/layer_27/mlp": 39.189903259277344, "eoc/jacobian_sigma/layer_27": 39.189903259277344, "eoc/layer0_sigma": 10572.6376953125, "eoc/sigma_max": 39.189903259277344, "eoc/sigma_min": 1.8443704843521118, "eoc/sigma_mean": 14.908017188310623, "eoc/time_s": 0.934689998626709} {"step": 12010, "timestamp": 1778338699.833181, "train/loss": 2.3903738498687743, "train/z_loss": 0.0013605314190499485, "train/perplexity": 10.917574714070778, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1039316.2002925935, "perf/iters_per_sec": 0.49558458342199013, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 2.0178190231323243, "data/tokens_consumed": 25188892672, "data/tokens_consumed_B": 25.188892672, "train/loss_slope": 3.337141217822038e-06} {"step": 12020, "timestamp": 1778338710.1990695, "train/loss": 2.33444344997406, "train/z_loss": 0.001352938381023705, "train/perplexity": 10.323712667645577, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024798.9257832489, "perf/iters_per_sec": 0.9654993657032246, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357334613800049, "data/tokens_consumed": 25209864192, "data/tokens_consumed_B": 25.209864192, "train/loss_slope": 4.616843849340064e-06} {"step": 12030, "timestamp": 1778338720.559927, "train/loss": 2.349672031402588, "train/z_loss": 0.0013584615546278656, "train/perplexity": 10.482131351002014, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025714.3295844274, "perf/iters_per_sec": 0.9659358642503869, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352654218673707, "data/tokens_consumed": 25230835712, "data/tokens_consumed_B": 25.230835712, "train/loss_slope": 4.479771550267523e-06} {"step": 12040, "timestamp": 1778338730.9206793, "train/loss": 2.2895340442657472, "train/z_loss": 0.0013681759592145681, "train/perplexity": 9.870337469166065, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025267.4591607617, "perf/iters_per_sec": 0.9657227798274811, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354938507080078, "data/tokens_consumed": 25251807232, "data/tokens_consumed_B": 25.251807232, "train/loss_slope": 9.61325099222901e-07} {"step": 12050, "timestamp": 1778338741.2746718, "grad/layer_0/attn": 0.003037857823073864, "grad/layer_0/mlp": 0.003374115563929081, "grad/layer_0/attn_mlp_ratio": 0.9003419341992407, "grad/layer_4/attn": 0.0019654175266623497, "grad/layer_4/mlp": 0.002701616147533059, "grad/layer_4/attn_mlp_ratio": 0.7274969302013347, "grad/layer_8/attn": 0.006037777289748192, "grad/layer_8/mlp": 0.003753500524908304, "grad/layer_8/attn_mlp_ratio": 1.6085723416912154, "grad/layer_12/attn": 0.005315438378602266, "grad/layer_12/mlp": 0.007591055240482092, "grad/layer_12/attn_mlp_ratio": 0.7002239003917858, "grad/layer_16/attn": 0.003854169975966215, "grad/layer_16/mlp": 0.005022712051868439, "grad/layer_16/attn_mlp_ratio": 0.7673483686562591, "grad/layer_20/attn": 0.0037453393451869488, "grad/layer_20/mlp": 0.005831487942487001, "grad/layer_20/attn_mlp_ratio": 0.642261343571176, "grad/layer_24/attn": 0.009119418449699879, "grad/layer_24/mlp": 0.00978703424334526, "grad/layer_24/attn_mlp_ratio": 0.9317856798878682, "grad/layer_27/attn": 0.008708376437425613, "grad/layer_27/mlp": 0.008339593186974525, "grad/layer_27/attn_mlp_ratio": 1.0442207596654725} {"step": 12050, "timestamp": 1778338741.2898977, "train/loss": 2.3288384437561036, "train/z_loss": 0.0013586436049081384, "train/perplexity": 10.266010056763234, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023803.6974907469, "perf/iters_per_sec": 0.9650248038724646, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362427949905395, "data/tokens_consumed": 25272778752, "data/tokens_consumed_B": 25.272778752, "train/loss_slope": -1.8253635008199521e-06} {"step": 12060, "timestamp": 1778338751.646828, "train/loss": 2.331546902656555, "train/z_loss": 0.0013580605038441718, "train/perplexity": 10.29385281153146, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026038.9834726397, "perf/iters_per_sec": 0.9660906712878417, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350995302200316, "data/tokens_consumed": 25293750272, "data/tokens_consumed_B": 25.293750272, "train/loss_slope": -1.6181461953415266e-06} {"step": 12070, "timestamp": 1778338762.015779, "train/loss": 2.3175376892089843, "train/z_loss": 0.0013617414282634855, "train/perplexity": 10.150649455547127, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023580.6822160035, "perf/iters_per_sec": 0.9649184619026201, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363569974899292, "data/tokens_consumed": 25314721792, "data/tokens_consumed_B": 25.314721792, "train/loss_slope": -1.6811896329021549e-06} {"step": 12075, "timestamp": 1778338767.7930207, "eos/sharpness": 20.791125297546383, "eos/L0_probe": 2.315438747406006, "eos/L_plus": 2.399684190750122, "eos/L_minus": 2.4391045570373535, "eos/grad_norm": 0.10373950749635696, "eos/embed_grad_frac": 0.22399045526981354, "eos/time_s": 0.6055452823638916} {"step": 12075, "timestamp": 1778338769.1767383, "geo/rankme_last": 429.8322448730469, "geo/layer_0/stable_rank_q_proj": 20.695537567138672, "geo/layer_0/stable_rank_k_proj": 16.924283981323242, "geo/layer_0/stable_rank_o_proj": 44.02610397338867, "geo/layer_0/stable_rank_gate_proj": 125.47248077392578, "geo/layer_0/stable_rank_down_proj": 57.50939178466797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06171736121177673, "geo/layer_0/attn_entropy_mean": 6.239083290100098, "geo/layer_0/attn_entropy_std": 0.45340171456336975, "geo/layer_7/stable_rank_q_proj": 41.918514251708984, "geo/layer_7/stable_rank_k_proj": 38.87662887573242, "geo/layer_7/stable_rank_o_proj": 88.65730285644531, "geo/layer_7/stable_rank_gate_proj": 78.57353973388672, "geo/layer_7/stable_rank_down_proj": 144.66468811035156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3988012671470642, "geo/layer_7/attn_entropy_mean": 4.719839572906494, "geo/layer_7/attn_entropy_std": 0.7535960674285889, "geo/layer_14/stable_rank_q_proj": 51.95597839355469, "geo/layer_14/stable_rank_k_proj": 43.0007438659668, "geo/layer_14/stable_rank_o_proj": 42.26171112060547, "geo/layer_14/stable_rank_gate_proj": 72.12384033203125, "geo/layer_14/stable_rank_down_proj": 127.55040740966797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37022873759269714, "geo/layer_14/attn_entropy_mean": 5.498161315917969, "geo/layer_14/attn_entropy_std": 0.4800933003425598, "geo/layer_21/stable_rank_q_proj": 38.38743591308594, "geo/layer_21/stable_rank_k_proj": 28.531497955322266, "geo/layer_21/stable_rank_o_proj": 65.25711059570312, "geo/layer_21/stable_rank_gate_proj": 60.23992919921875, "geo/layer_21/stable_rank_down_proj": 48.97206497192383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1407870203256607, "geo/layer_21/attn_entropy_mean": 5.842533111572266, "geo/layer_21/attn_entropy_std": 0.3340083360671997, "geo/layer_27/stable_rank_q_proj": 44.573646545410156, "geo/layer_27/stable_rank_k_proj": 30.321170806884766, "geo/layer_27/stable_rank_o_proj": 107.50959777832031, "geo/layer_27/stable_rank_gate_proj": 70.26944732666016, "geo/layer_27/stable_rank_down_proj": 130.12643432617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10505232214927673, "geo/layer_27/attn_entropy_mean": 4.317571640014648, "geo/layer_27/attn_entropy_std": 0.682826817035675, "attnres/final_alpha/block_0": 0.2612517178058624, "attnres/block_norm/0": 1.781172752380371, "attnres/final_alpha/block_1": 0.0039282385259866714, "attnres/block_norm/1": 50652.06640625, "attnres/final_alpha/block_2": 0.008350919000804424, "attnres/block_norm/2": 29882.046875, "attnres/final_alpha/block_3": 0.010478656738996506, "attnres/block_norm/3": 72356.890625, "attnres/final_alpha/block_4": 0.011683416552841663, "attnres/block_norm/4": 17450.58984375, "attnres/final_alpha/block_5": 0.6033914685249329, "attnres/block_norm/5": 7180.8076171875, "attnres/final_alpha/block_6": 0.1009155809879303, "attnres/block_norm/6": 48521.984375, "geo/tier1_time_s": 1.3641862869262695, "geo/step": 12075.0, "geo/rankme_slope": 0.0002835964268519908} {"step": 12080, "timestamp": 1778338774.3627932, "train/loss": 2.3189697742462156, "train/z_loss": 0.0013629603083245457, "train/perplexity": 10.165196462539956, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699540.3990175694, "perf/iters_per_sec": 0.8104040141189429, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2339524269104003, "data/tokens_consumed": 25335693312, "data/tokens_consumed_B": 25.335693312, "train/loss_slope": -2.695892195496978e-06} {"step": 12090, "timestamp": 1778338784.7153914, "train/loss": 2.3335439443588255, "train/z_loss": 0.0013573861215263605, "train/perplexity": 10.314430605390372, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026542.9676495648, "perf/iters_per_sec": 0.966330989670546, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348421096801759, "data/tokens_consumed": 25356664832, "data/tokens_consumed_B": 25.356664832, "train/loss_slope": -6.313678490804218e-07} {"step": 12100, "timestamp": 1778338795.06121, "grad/layer_0/attn": 0.0029289009980857372, "grad/layer_0/mlp": 0.003203092608600855, "grad/layer_0/attn_mlp_ratio": 0.9143978225235674, "grad/layer_4/attn": 0.0022861540783196688, "grad/layer_4/mlp": 0.0025454761926084757, "grad/layer_4/attn_mlp_ratio": 0.8981242861927947, "grad/layer_8/attn": 0.0030876288656145334, "grad/layer_8/mlp": 0.0033944149035960436, "grad/layer_8/attn_mlp_ratio": 0.90962032112853, "grad/layer_12/attn": 0.005461947992444038, "grad/layer_12/mlp": 0.007020607590675354, "grad/layer_12/attn_mlp_ratio": 0.7779879225695093, "grad/layer_16/attn": 0.003948193974792957, "grad/layer_16/mlp": 0.004315439611673355, "grad/layer_16/attn_mlp_ratio": 0.9148995788570536, "grad/layer_20/attn": 0.002770035993307829, "grad/layer_20/mlp": 0.00594133697450161, "grad/layer_20/attn_mlp_ratio": 0.46623107872400826, "grad/layer_24/attn": 0.010305632837116718, "grad/layer_24/mlp": 0.010751581750810146, "grad/layer_24/attn_mlp_ratio": 0.9585224741920348, "grad/layer_27/attn": 0.007289922330528498, "grad/layer_27/mlp": 0.009663387201726437, "grad/layer_27/attn_mlp_ratio": 0.7543858176134676} {"step": 12100, "timestamp": 1778338795.076191, "train/loss": 2.4077690124511717, "train/z_loss": 0.0013514265534467994, "train/perplexity": 11.1091491043499, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025332.6980653005, "perf/iters_per_sec": 0.9657538881613257, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354604959487914, "data/tokens_consumed": 25377636352, "data/tokens_consumed_B": 25.377636352, "train/loss_slope": 4.120375306764379e-06} {"step": 12110, "timestamp": 1778338805.4396422, "train/loss": 2.3439468145370483, "train/z_loss": 0.001364593580365181, "train/perplexity": 10.422290340641508, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025039.0395375483, "perf/iters_per_sec": 0.9656138608634702, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356106519699098, "data/tokens_consumed": 25398607872, "data/tokens_consumed_B": 25.398607872, "train/loss_slope": 6.55139812363043e-06} {"step": 12120, "timestamp": 1778338815.7972624, "train/loss": 2.3377523183822633, "train/z_loss": 0.0013579050078988074, "train/perplexity": 10.357929051884861, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025778.8041541476, "perf/iters_per_sec": 0.9659666081209887, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352324724197388, "data/tokens_consumed": 25419579392, "data/tokens_consumed_B": 25.419579392, "train/loss_slope": 5.154421806621597e-06} {"step": 12130, "timestamp": 1778338826.1572545, "train/loss": 2.3665536642074585, "train/z_loss": 0.0013658195734024049, "train/perplexity": 10.66058893297543, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025610.5817293562, "perf/iters_per_sec": 0.9658863934180051, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353184461593627, "data/tokens_consumed": 25440550912, "data/tokens_consumed_B": 25.440550912, "train/loss_slope": 6.111173918752967e-06} {"step": 12140, "timestamp": 1778338836.5558023, "train/loss": 2.380113410949707, "train/z_loss": 0.001363920234143734, "train/perplexity": 10.80612832771574, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018310.587778327, "perf/iters_per_sec": 0.9624054850474963, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03906307220459, "data/tokens_consumed": 25461522432, "data/tokens_consumed_B": 25.461522432, "train/loss_slope": 6.8328411057658435e-06} {"step": 12150, "timestamp": 1778338846.9024534, "grad/layer_0/attn": 0.0040592472068965435, "grad/layer_0/mlp": 0.0038335404824465513, "grad/layer_0/attn_mlp_ratio": 1.0588768058132692, "grad/layer_4/attn": 0.0018794512143358588, "grad/layer_4/mlp": 0.0027426229789853096, "grad/layer_4/attn_mlp_ratio": 0.6852750670468364, "grad/layer_8/attn": 0.0034140574280172586, "grad/layer_8/mlp": 0.003607474034652114, "grad/layer_8/attn_mlp_ratio": 0.9463844508885142, "grad/layer_12/attn": 0.0067957378923892975, "grad/layer_12/mlp": 0.007502526044845581, "grad/layer_12/attn_mlp_ratio": 0.9057932969761306, "grad/layer_16/attn": 0.003403760027140379, "grad/layer_16/mlp": 0.004527873359620571, "grad/layer_16/attn_mlp_ratio": 0.7517347950412044, "grad/layer_20/attn": 0.003153679659590125, "grad/layer_20/mlp": 0.0059836735017597675, "grad/layer_20/attn_mlp_ratio": 0.527047407576283, "grad/layer_24/attn": 0.007927251048386097, "grad/layer_24/mlp": 0.00988711602985859, "grad/layer_24/attn_mlp_ratio": 0.8017758610568153, "grad/layer_27/attn": 0.005481739994138479, "grad/layer_27/mlp": 0.010381804779171944, "grad/layer_27/attn_mlp_ratio": 0.5280141611152785} {"step": 12150, "timestamp": 1778338847.4962554, "eos/sharpness": 49.14016723632812, "eos/L0_probe": 2.3143601417541504, "eos/L_plus": 2.5522913932800293, "eos/L_minus": 2.5678305625915527, "eos/grad_norm": 0.153656467795372, "eos/embed_grad_frac": 0.1138952299952507, "eos/time_s": 0.5907740592956543} {"step": 12150, "timestamp": 1778338847.5160208, "train/loss": 2.3216133594512938, "train/z_loss": 0.0013582189450971783, "train/perplexity": 10.192104576790697, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914466.5654569678, "perf/iters_per_sec": 0.9128887965473975, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0954236745834351, "data/tokens_consumed": 25482493952, "data/tokens_consumed_B": 25.482493952, "train/loss_slope": 6.151285623595621e-06} {"step": 12150, "timestamp": 1778338848.882191, "geo/rankme_last": 429.8494873046875, "geo/layer_0/stable_rank_q_proj": 20.650619506835938, "geo/layer_0/stable_rank_k_proj": 16.92820167541504, "geo/layer_0/stable_rank_o_proj": 43.97167205810547, "geo/layer_0/stable_rank_gate_proj": 125.6874008178711, "geo/layer_0/stable_rank_down_proj": 57.532493591308594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06381481885910034, "geo/layer_0/attn_entropy_mean": 6.228921890258789, "geo/layer_0/attn_entropy_std": 0.4526735246181488, "geo/layer_7/stable_rank_q_proj": 41.98090744018555, "geo/layer_7/stable_rank_k_proj": 38.90595245361328, "geo/layer_7/stable_rank_o_proj": 88.6976089477539, "geo/layer_7/stable_rank_gate_proj": 78.58804321289062, "geo/layer_7/stable_rank_down_proj": 144.6509552001953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40206897258758545, "geo/layer_7/attn_entropy_mean": 4.7688188552856445, "geo/layer_7/attn_entropy_std": 0.7752841711044312, "geo/layer_14/stable_rank_q_proj": 52.04322814941406, "geo/layer_14/stable_rank_k_proj": 42.88447952270508, "geo/layer_14/stable_rank_o_proj": 42.219642639160156, "geo/layer_14/stable_rank_gate_proj": 72.14412689208984, "geo/layer_14/stable_rank_down_proj": 127.66032409667969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38240352272987366, "geo/layer_14/attn_entropy_mean": 5.543208122253418, "geo/layer_14/attn_entropy_std": 0.4684835374355316, "geo/layer_21/stable_rank_q_proj": 38.35360336303711, "geo/layer_21/stable_rank_k_proj": 28.539920806884766, "geo/layer_21/stable_rank_o_proj": 65.20276641845703, "geo/layer_21/stable_rank_gate_proj": 60.17833709716797, "geo/layer_21/stable_rank_down_proj": 48.937042236328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13718898594379425, "geo/layer_21/attn_entropy_mean": 5.86760950088501, "geo/layer_21/attn_entropy_std": 0.3327692747116089, "geo/layer_27/stable_rank_q_proj": 44.56721115112305, "geo/layer_27/stable_rank_k_proj": 30.294931411743164, "geo/layer_27/stable_rank_o_proj": 107.421142578125, "geo/layer_27/stable_rank_gate_proj": 70.22827911376953, "geo/layer_27/stable_rank_down_proj": 130.197265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10999059677124023, "geo/layer_27/attn_entropy_mean": 4.314720630645752, "geo/layer_27/attn_entropy_std": 0.7029143571853638, "attnres/final_alpha/block_0": 0.2620862126350403, "attnres/block_norm/0": 1.781329870223999, "attnres/final_alpha/block_1": 0.0038169806357473135, "attnres/block_norm/1": 50612.46484375, "attnres/final_alpha/block_2": 0.00842225831001997, "attnres/block_norm/2": 29976.0078125, "attnres/final_alpha/block_3": 0.010620377957820892, "attnres/block_norm/3": 72809.6015625, "attnres/final_alpha/block_4": 0.012039022520184517, "attnres/block_norm/4": 17534.52734375, "attnres/final_alpha/block_5": 0.603071391582489, "attnres/block_norm/5": 7310.9677734375, "attnres/final_alpha/block_6": 0.09994378685951233, "attnres/block_norm/6": 48566.44921875, "geo/tier1_time_s": 1.3619134426116943, "geo/step": 12150.0, "geo/rankme_slope": 0.00029657915900735296} {"step": 12160, "timestamp": 1778338859.2416434, "train/loss": 2.3278459787368773, "train/z_loss": 0.0013582545216195285, "train/perplexity": 10.255826455164941, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789171.3364862772, "perf/iters_per_sec": 0.8531433756286035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1721359252929688, "data/tokens_consumed": 25503465472, "data/tokens_consumed_B": 25.503465472, "train/loss_slope": 6.853629961194041e-06} {"step": 12170, "timestamp": 1778338869.6050801, "train/loss": 2.328089714050293, "train/z_loss": 0.0013601088081486522, "train/perplexity": 10.258326466898524, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025070.741978211, "perf/iters_per_sec": 0.965628977765184, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355944395065309, "data/tokens_consumed": 25524436992, "data/tokens_consumed_B": 25.524436992, "train/loss_slope": 4.51797629275408e-06} {"step": 12180, "timestamp": 1778338879.9647832, "train/loss": 2.292864537239075, "train/z_loss": 0.0013767510885372759, "train/perplexity": 9.903265361371457, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025370.3324257026, "perf/iters_per_sec": 0.9657718336227906, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035441255569458, "data/tokens_consumed": 25545408512, "data/tokens_consumed_B": 25.545408512, "train/loss_slope": 1.8548370731009262e-06} {"step": 12190, "timestamp": 1778338890.3200662, "train/loss": 2.306830549240112, "train/z_loss": 0.0013721455470658839, "train/perplexity": 10.042544809622614, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026530.7816905018, "perf/iters_per_sec": 0.9663251789524564, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348483324050903, "data/tokens_consumed": 25566380032, "data/tokens_consumed_B": 25.566380032, "train/loss_slope": 5.021459949721154e-07} {"step": 12200, "timestamp": 1778338900.6681201, "grad/layer_0/attn": 0.002780113834887743, "grad/layer_0/mlp": 0.003149142023175955, "grad/layer_0/attn_mlp_ratio": 0.8828162484086169, "grad/layer_4/attn": 0.0019492864375934005, "grad/layer_4/mlp": 0.002590530551970005, "grad/layer_4/attn_mlp_ratio": 0.7524660772151214, "grad/layer_8/attn": 0.003286520252004266, "grad/layer_8/mlp": 0.003368481993675232, "grad/layer_8/attn_mlp_ratio": 0.9756680191873781, "grad/layer_12/attn": 0.008727054111659527, "grad/layer_12/mlp": 0.006935357116162777, "grad/layer_12/attn_mlp_ratio": 1.2583424097206153, "grad/layer_16/attn": 0.0031765291932970285, "grad/layer_16/mlp": 0.0040525528602302074, "grad/layer_16/attn_mlp_ratio": 0.7838340977822988, "grad/layer_20/attn": 0.003330077975988388, "grad/layer_20/mlp": 0.005472376476973295, "grad/layer_20/attn_mlp_ratio": 0.6085250035607407, "grad/layer_24/attn": 0.007259901147335768, "grad/layer_24/mlp": 0.009598375298082829, "grad/layer_24/attn_mlp_ratio": 0.7563677024745099, "grad/layer_27/attn": 0.005606208462268114, "grad/layer_27/mlp": 0.009118477813899517, "grad/layer_27/attn_mlp_ratio": 0.6148184505357451} {"step": 12200, "timestamp": 1778338900.682919, "train/loss": 2.3426220655441283, "train/z_loss": 0.0013599210185930132, "train/perplexity": 10.408492563322506, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025067.7115549473, "perf/iters_per_sec": 0.9656275327467667, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035595989227295, "data/tokens_consumed": 25587351552, "data/tokens_consumed_B": 25.587351552, "train/loss_slope": 2.544652296192615e-07} {"step": 12210, "timestamp": 1778338911.0458753, "train/loss": 2.347750496864319, "train/z_loss": 0.0013531018630601466, "train/perplexity": 10.462008912747939, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024765.6005740918, "perf/iters_per_sec": 0.9654834750051936, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357505083084106, "data/tokens_consumed": 25608323072, "data/tokens_consumed_B": 25.608323072, "train/loss_slope": 1.822914706669935e-06} {"step": 12220, "timestamp": 1778338921.4024403, "train/loss": 2.3715245962142943, "train/z_loss": 0.0013576438534073532, "train/perplexity": 10.713713926685976, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026329.1510231378, "perf/iters_per_sec": 0.966229033958024, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349513053894044, "data/tokens_consumed": 25629294592, "data/tokens_consumed_B": 25.629294592, "train/loss_slope": 4.010525673958684e-06} {"step": 12225, "timestamp": 1778338927.1602674, "eos/sharpness": 59.44635868072508, "eos/L0_probe": 2.3166310787200928, "eos/L_plus": 2.592621326446533, "eos/L_minus": 2.6351044178009033, "eos/grad_norm": 0.21960042417049408, "eos/embed_grad_frac": 0.05056663602590561, "eos/time_s": 0.5838541984558105} {"step": 12225, "timestamp": 1778338928.5383193, "geo/rankme_last": 429.82403564453125, "geo/layer_0/stable_rank_q_proj": 20.6478214263916, "geo/layer_0/stable_rank_k_proj": 16.916000366210938, "geo/layer_0/stable_rank_o_proj": 43.91775131225586, "geo/layer_0/stable_rank_gate_proj": 125.44385528564453, "geo/layer_0/stable_rank_down_proj": 57.614933013916016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06481482833623886, "geo/layer_0/attn_entropy_mean": 6.23035192489624, "geo/layer_0/attn_entropy_std": 0.4525887072086334, "geo/layer_7/stable_rank_q_proj": 41.95506286621094, "geo/layer_7/stable_rank_k_proj": 38.833465576171875, "geo/layer_7/stable_rank_o_proj": 88.60484313964844, "geo/layer_7/stable_rank_gate_proj": 78.3987808227539, "geo/layer_7/stable_rank_down_proj": 144.59800720214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3923586905002594, "geo/layer_7/attn_entropy_mean": 4.728481292724609, "geo/layer_7/attn_entropy_std": 0.7685081362724304, "geo/layer_14/stable_rank_q_proj": 52.068115234375, "geo/layer_14/stable_rank_k_proj": 42.9241828918457, "geo/layer_14/stable_rank_o_proj": 42.1667594909668, "geo/layer_14/stable_rank_gate_proj": 71.98441314697266, "geo/layer_14/stable_rank_down_proj": 127.67711639404297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38246092200279236, "geo/layer_14/attn_entropy_mean": 5.480465412139893, "geo/layer_14/attn_entropy_std": 0.4604490399360657, "geo/layer_21/stable_rank_q_proj": 38.40896987915039, "geo/layer_21/stable_rank_k_proj": 28.582902908325195, "geo/layer_21/stable_rank_o_proj": 65.18467712402344, "geo/layer_21/stable_rank_gate_proj": 60.183929443359375, "geo/layer_21/stable_rank_down_proj": 48.89564895629883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13250622153282166, "geo/layer_21/attn_entropy_mean": 5.8510332107543945, "geo/layer_21/attn_entropy_std": 0.32712841033935547, "geo/layer_27/stable_rank_q_proj": 44.6463737487793, "geo/layer_27/stable_rank_k_proj": 30.319774627685547, "geo/layer_27/stable_rank_o_proj": 107.6341781616211, "geo/layer_27/stable_rank_gate_proj": 70.17156219482422, "geo/layer_27/stable_rank_down_proj": 130.33615112304688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10480867326259613, "geo/layer_27/attn_entropy_mean": 4.301450729370117, "geo/layer_27/attn_entropy_std": 0.6844064593315125, "attnres/final_alpha/block_0": 0.26235055923461914, "attnres/block_norm/0": 1.7817258834838867, "attnres/final_alpha/block_1": 0.003823140636086464, "attnres/block_norm/1": 50693.01953125, "attnres/final_alpha/block_2": 0.008288996294140816, "attnres/block_norm/2": 29988.48828125, "attnres/final_alpha/block_3": 0.01052954513579607, "attnres/block_norm/3": 72466.7265625, "attnres/final_alpha/block_4": 0.011938927695155144, "attnres/block_norm/4": 17476.96875, "attnres/final_alpha/block_5": 0.6021909713745117, "attnres/block_norm/5": 7243.53125, "attnres/final_alpha/block_6": 0.10087788105010986, "attnres/block_norm/6": 48619.97265625, "geo/tier1_time_s": 1.3602275848388672, "geo/step": 12225.0, "geo/rankme_slope": 0.0002634683756315026} {"step": 12230, "timestamp": 1778338933.7228727, "train/loss": 2.37024781703949, "train/z_loss": 0.0013512557838112117, "train/perplexity": 10.700043608705299, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703151.2732814793, "perf/iters_per_sec": 0.8121258131415745, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2313363075256347, "data/tokens_consumed": 25650266112, "data/tokens_consumed_B": 25.650266112, "train/loss_slope": 3.585071088743391e-06} {"step": 12240, "timestamp": 1778338944.092089, "train/loss": 2.2985311269760134, "train/z_loss": 0.0013670023647136987, "train/perplexity": 9.959542402092046, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023486.4625581785, "perf/iters_per_sec": 0.9648735344687359, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364052534103394, "data/tokens_consumed": 25671237632, "data/tokens_consumed_B": 25.671237632, "train/loss_slope": 7.43847683031871e-07} {"step": 12250, "timestamp": 1778338954.4482076, "grad/layer_0/attn": 0.0028463175985962152, "grad/layer_0/mlp": 0.0034243573900312185, "grad/layer_0/attn_mlp_ratio": 0.831197562428059, "grad/layer_4/attn": 0.0026185140013694763, "grad/layer_4/mlp": 0.0026574626099318266, "grad/layer_4/attn_mlp_ratio": 0.9853436481284249, "grad/layer_8/attn": 0.005199510604143143, "grad/layer_8/mlp": 0.0035260955337435007, "grad/layer_8/attn_mlp_ratio": 1.4745801430867747, "grad/layer_12/attn": 0.006415252573788166, "grad/layer_12/mlp": 0.0068360925652086735, "grad/layer_12/attn_mlp_ratio": 0.9384385039772335, "grad/layer_16/attn": 0.005129036959260702, "grad/layer_16/mlp": 0.004496163222938776, "grad/layer_16/attn_mlp_ratio": 1.1407585959106727, "grad/layer_20/attn": 0.00329075800254941, "grad/layer_20/mlp": 0.0066216737031936646, "grad/layer_20/attn_mlp_ratio": 0.4969677003663729, "grad/layer_24/attn": 0.012051424011588097, "grad/layer_24/mlp": 0.009580140933394432, "grad/layer_24/attn_mlp_ratio": 1.2579589350072482, "grad/layer_27/attn": 0.006003932561725378, "grad/layer_27/mlp": 0.01104535162448883, "grad/layer_27/attn_mlp_ratio": 0.5435709709826588} {"step": 12250, "timestamp": 1778338954.463952, "train/loss": 2.3214654445648195, "train/z_loss": 0.0013463281327858567, "train/perplexity": 10.19059712428937, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023538.8781426777, "perf/iters_per_sec": 0.9648985281670941, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363784074783324, "data/tokens_consumed": 25692209152, "data/tokens_consumed_B": 25.692209152, "train/loss_slope": -3.21022102935084e-06} {"step": 12260, "timestamp": 1778338964.8258975, "train/loss": 2.3309486627578737, "train/z_loss": 0.0013612621114589274, "train/perplexity": 10.287696459739694, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025471.1639621372, "perf/iters_per_sec": 0.9658199138460813, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353897094726563, "data/tokens_consumed": 25713180672, "data/tokens_consumed_B": 25.713180672, "train/loss_slope": -3.6526806891733202e-06} {"step": 12270, "timestamp": 1778338975.199654, "train/loss": 2.3370861768722535, "train/z_loss": 0.0013635739218443633, "train/perplexity": 10.351031503012537, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022654.6938413598, "perf/iters_per_sec": 0.9644769162375258, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368314504623413, "data/tokens_consumed": 25734152192, "data/tokens_consumed_B": 25.734152192, "train/loss_slope": -2.463915395979881e-06} {"step": 12280, "timestamp": 1778338985.5607579, "train/loss": 2.288033294677734, "train/z_loss": 0.0013665685546584427, "train/perplexity": 9.855535673947813, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025619.7711958163, "perf/iters_per_sec": 0.9658907752970773, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353137493133544, "data/tokens_consumed": 25755123712, "data/tokens_consumed_B": 25.755123712, "train/loss_slope": -4.293617456838431e-06} {"step": 12290, "timestamp": 1778338995.915945, "train/loss": 2.3055036306381225, "train/z_loss": 0.00135544512886554, "train/perplexity": 10.029228007213751, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026170.8250813892, "perf/iters_per_sec": 0.9661535382658907, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350321769714355, "data/tokens_consumed": 25776095232, "data/tokens_consumed_B": 25.776095232, "train/loss_slope": -5.4354864092442394e-06} {"step": 12300, "timestamp": 1778339006.283517, "grad/layer_0/attn": 0.002878953004255891, "grad/layer_0/mlp": 0.002964590210467577, "grad/layer_0/attn_mlp_ratio": 0.9711132746034715, "grad/layer_4/attn": 0.001853708759881556, "grad/layer_4/mlp": 0.002707540523260832, "grad/layer_4/attn_mlp_ratio": 0.6846467026038702, "grad/layer_8/attn": 0.005886869505047798, "grad/layer_8/mlp": 0.003374785650521517, "grad/layer_8/attn_mlp_ratio": 1.7443683659438471, "grad/layer_12/attn": 0.006343970075249672, "grad/layer_12/mlp": 0.006229987367987633, "grad/layer_12/attn_mlp_ratio": 1.0182958004085452, "grad/layer_16/attn": 0.005244698841124773, "grad/layer_16/mlp": 0.004161247983574867, "grad/layer_16/attn_mlp_ratio": 1.2603667783774932, "grad/layer_20/attn": 0.004892562981694937, "grad/layer_20/mlp": 0.005259123630821705, "grad/layer_20/attn_mlp_ratio": 0.9303000332586785, "grad/layer_24/attn": 0.007826614193618298, "grad/layer_24/mlp": 0.007448010612279177, "grad/layer_24/attn_mlp_ratio": 1.050832832546137, "grad/layer_27/attn": 0.007464895024895668, "grad/layer_27/mlp": 0.007434483151882887, "grad/layer_27/attn_mlp_ratio": 1.0040906371004439} {"step": 12300, "timestamp": 1778339006.8821886, "eos/sharpness": 54.9823760986328, "eos/L0_probe": 2.315563678741455, "eos/L_plus": 2.6588025093078613, "eos/L_minus": 2.522148609161377, "eos/grad_norm": 0.13990198075771332, "eos/embed_grad_frac": 0.12347108125686646, "eos/time_s": 0.5958387851715088} {"step": 12300, "timestamp": 1778339006.9042337, "train/loss": 2.36193208694458, "train/z_loss": 0.0013570254668593407, "train/perplexity": 10.611433872001758, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909430.6080417177, "perf/iters_per_sec": 0.9104874649246777, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.098312759399414, "data/tokens_consumed": 25797066752, "data/tokens_consumed_B": 25.797066752, "train/loss_slope": -3.8094374355953564e-06} {"step": 12300, "timestamp": 1778339008.2692788, "geo/rankme_last": 430.4736328125, "geo/layer_0/stable_rank_q_proj": 20.625486373901367, "geo/layer_0/stable_rank_k_proj": 16.914222717285156, "geo/layer_0/stable_rank_o_proj": 43.89975357055664, "geo/layer_0/stable_rank_gate_proj": 125.6828384399414, "geo/layer_0/stable_rank_down_proj": 57.644935607910156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06213688477873802, "geo/layer_0/attn_entropy_mean": 6.228427410125732, "geo/layer_0/attn_entropy_std": 0.45623907446861267, "geo/layer_7/stable_rank_q_proj": 41.8768196105957, "geo/layer_7/stable_rank_k_proj": 38.88040542602539, "geo/layer_7/stable_rank_o_proj": 88.65054321289062, "geo/layer_7/stable_rank_gate_proj": 78.36100006103516, "geo/layer_7/stable_rank_down_proj": 144.68490600585938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4038938581943512, "geo/layer_7/attn_entropy_mean": 4.743049621582031, "geo/layer_7/attn_entropy_std": 0.7724077105522156, "geo/layer_14/stable_rank_q_proj": 52.053062438964844, "geo/layer_14/stable_rank_k_proj": 43.0767707824707, "geo/layer_14/stable_rank_o_proj": 42.19012451171875, "geo/layer_14/stable_rank_gate_proj": 71.96156311035156, "geo/layer_14/stable_rank_down_proj": 127.58061218261719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3873997628688812, "geo/layer_14/attn_entropy_mean": 5.498153209686279, "geo/layer_14/attn_entropy_std": 0.48160967230796814, "geo/layer_21/stable_rank_q_proj": 38.3874397277832, "geo/layer_21/stable_rank_k_proj": 28.57967185974121, "geo/layer_21/stable_rank_o_proj": 65.17406463623047, "geo/layer_21/stable_rank_gate_proj": 60.135276794433594, "geo/layer_21/stable_rank_down_proj": 48.88627624511719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13825483620166779, "geo/layer_21/attn_entropy_mean": 5.839594841003418, "geo/layer_21/attn_entropy_std": 0.326641321182251, "geo/layer_27/stable_rank_q_proj": 44.66006088256836, "geo/layer_27/stable_rank_k_proj": 30.336963653564453, "geo/layer_27/stable_rank_o_proj": 107.59001159667969, "geo/layer_27/stable_rank_gate_proj": 70.15691375732422, "geo/layer_27/stable_rank_down_proj": 130.35906982421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09643848240375519, "geo/layer_27/attn_entropy_mean": 4.323901176452637, "geo/layer_27/attn_entropy_std": 0.6808294057846069, "attnres/final_alpha/block_0": 0.2612905502319336, "attnres/block_norm/0": 1.7818198204040527, "attnres/final_alpha/block_1": 0.0038302226457744837, "attnres/block_norm/1": 50796.46875, "attnres/final_alpha/block_2": 0.008296294137835503, "attnres/block_norm/2": 30040.97265625, "attnres/final_alpha/block_3": 0.01054816972464323, "attnres/block_norm/3": 72416.765625, "attnres/final_alpha/block_4": 0.011792679317295551, "attnres/block_norm/4": 17515.091796875, "attnres/final_alpha/block_5": 0.6049185991287231, "attnres/block_norm/5": 7272.2236328125, "attnres/final_alpha/block_6": 0.09932348877191544, "attnres/block_norm/6": 48840.22265625, "geo/tier1_time_s": 1.3616676330566406, "geo/step": 12300.0, "geo/rankme_slope": 0.00026983541463460384} {"step": 12310, "timestamp": 1778339018.6368241, "train/loss": 2.3675196409225463, "train/z_loss": 0.0013525024405680597, "train/perplexity": 10.670891789012181, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787956.500797173, "perf/iters_per_sec": 0.8525640968309274, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1729323387145996, "data/tokens_consumed": 25818038272, "data/tokens_consumed_B": 25.818038272, "train/loss_slope": -1.0435647017384224e-06} {"step": 12320, "timestamp": 1778339029.0086102, "train/loss": 2.3574907779693604, "train/z_loss": 0.001355320552829653, "train/perplexity": 10.56440971720328, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023179.0993475087, "perf/iters_per_sec": 0.964726972268824, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036562705039978, "data/tokens_consumed": 25839009792, "data/tokens_consumed_B": 25.839009792, "train/loss_slope": -1.2515480941099347e-06} {"step": 12330, "timestamp": 1778339039.3857386, "train/loss": 2.3014455318450926, "train/z_loss": 0.0013650412671267985, "train/perplexity": 9.988610879042843, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021847.3099424439, "perf/iters_per_sec": 0.9640919255935878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372454881668092, "data/tokens_consumed": 25859981312, "data/tokens_consumed_B": 25.859981312, "train/loss_slope": -3.746815983897873e-06} {"step": 12340, "timestamp": 1778339049.7571087, "train/loss": 2.350840139389038, "train/z_loss": 0.0013696881476789712, "train/perplexity": 10.494382766443184, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023432.4205581136, "perf/iters_per_sec": 0.9648477652350014, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036432933807373, "data/tokens_consumed": 25880952832, "data/tokens_consumed_B": 25.880952832, "train/loss_slope": -4.214559029145103e-06} {"step": 12350, "timestamp": 1778339060.1143045, "grad/layer_0/attn": 0.0032106402795761824, "grad/layer_0/mlp": 0.0032959056552499533, "grad/layer_0/attn_mlp_ratio": 0.9741298805228414, "grad/layer_4/attn": 0.0018232709262520075, "grad/layer_4/mlp": 0.0025204757694154978, "grad/layer_4/attn_mlp_ratio": 0.723383607189553, "grad/layer_8/attn": 0.00278228591196239, "grad/layer_8/mlp": 0.003290697932243347, "grad/layer_8/attn_mlp_ratio": 0.8455002205309126, "grad/layer_12/attn": 0.005861139856278896, "grad/layer_12/mlp": 0.006271926686167717, "grad/layer_12/attn_mlp_ratio": 0.9345038703585027, "grad/layer_16/attn": 0.0034110352862626314, "grad/layer_16/mlp": 0.0045295278541743755, "grad/layer_16/attn_mlp_ratio": 0.7530663947264196, "grad/layer_20/attn": 0.0026499556843191385, "grad/layer_20/mlp": 0.005957693327218294, "grad/layer_20/attn_mlp_ratio": 0.44479557679362314, "grad/layer_24/attn": 0.009612798690795898, "grad/layer_24/mlp": 0.010158258490264416, "grad/layer_24/attn_mlp_ratio": 0.946303798567278, "grad/layer_27/attn": 0.006641154643148184, "grad/layer_27/mlp": 0.010474342852830887, "grad/layer_27/attn_mlp_ratio": 0.6340402136014925} {"step": 12350, "timestamp": 1778339060.1297746, "train/loss": 2.3240700006484984, "train/z_loss": 0.0013591250986792146, "train/perplexity": 10.217173701094172, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022755.7668299617, "perf/iters_per_sec": 0.9645251115941819, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367796421051025, "data/tokens_consumed": 25901924352, "data/tokens_consumed_B": 25.901924352, "train/loss_slope": -5.9616850690730184e-06} {"step": 12360, "timestamp": 1778339070.4987047, "train/loss": 2.356953167915344, "train/z_loss": 0.0013656767085194587, "train/perplexity": 10.558731710738002, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023569.183596012, "perf/iters_per_sec": 0.96491297893334, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036362886428833, "data/tokens_consumed": 25922895872, "data/tokens_consumed_B": 25.922895872, "train/loss_slope": -7.202672443338402e-06} {"step": 12370, "timestamp": 1778339080.873094, "train/loss": 2.3538384437561035, "train/z_loss": 0.0013525013928301632, "train/perplexity": 10.525895338655193, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022857.129040937, "perf/iters_per_sec": 0.9645734448628125, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367276906967162, "data/tokens_consumed": 25943867392, "data/tokens_consumed_B": 25.943867392, "train/loss_slope": -4.573089426214054e-06} {"step": 12375, "timestamp": 1778339086.639012, "eos/sharpness": 35.90855598449706, "eos/L0_probe": 2.3168373107910156, "eos/L_plus": 2.513390302658081, "eos/L_minus": 2.479369878768921, "eos/grad_norm": 0.150740385055542, "eos/embed_grad_frac": 0.10097523778676987, "eos/time_s": 0.5944645404815674} {"step": 12375, "timestamp": 1778339088.0242686, "geo/rankme_last": 428.8563232421875, "geo/layer_0/stable_rank_q_proj": 20.62620735168457, "geo/layer_0/stable_rank_k_proj": 16.914291381835938, "geo/layer_0/stable_rank_o_proj": 43.88115310668945, "geo/layer_0/stable_rank_gate_proj": 125.39032745361328, "geo/layer_0/stable_rank_down_proj": 57.6242790222168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.072132907807827, "geo/layer_0/attn_entropy_mean": 6.239217281341553, "geo/layer_0/attn_entropy_std": 0.45302292704582214, "geo/layer_7/stable_rank_q_proj": 41.87841033935547, "geo/layer_7/stable_rank_k_proj": 38.75927734375, "geo/layer_7/stable_rank_o_proj": 88.64153289794922, "geo/layer_7/stable_rank_gate_proj": 78.37429809570312, "geo/layer_7/stable_rank_down_proj": 144.7280731201172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4019535779953003, "geo/layer_7/attn_entropy_mean": 4.701938629150391, "geo/layer_7/attn_entropy_std": 0.7712305784225464, "geo/layer_14/stable_rank_q_proj": 52.149192810058594, "geo/layer_14/stable_rank_k_proj": 43.06495666503906, "geo/layer_14/stable_rank_o_proj": 42.09355163574219, "geo/layer_14/stable_rank_gate_proj": 71.91342163085938, "geo/layer_14/stable_rank_down_proj": 127.73284912109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38621798157691956, "geo/layer_14/attn_entropy_mean": 5.503537178039551, "geo/layer_14/attn_entropy_std": 0.4664747714996338, "geo/layer_21/stable_rank_q_proj": 38.30146408081055, "geo/layer_21/stable_rank_k_proj": 28.603384017944336, "geo/layer_21/stable_rank_o_proj": 65.14331817626953, "geo/layer_21/stable_rank_gate_proj": 60.16978454589844, "geo/layer_21/stable_rank_down_proj": 48.83582305908203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13671550154685974, "geo/layer_21/attn_entropy_mean": 5.835827827453613, "geo/layer_21/attn_entropy_std": 0.32588130235671997, "geo/layer_27/stable_rank_q_proj": 44.690250396728516, "geo/layer_27/stable_rank_k_proj": 30.23954963684082, "geo/layer_27/stable_rank_o_proj": 107.47528076171875, "geo/layer_27/stable_rank_gate_proj": 70.07331848144531, "geo/layer_27/stable_rank_down_proj": 130.1358184814453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1142895445227623, "geo/layer_27/attn_entropy_mean": 4.289328575134277, "geo/layer_27/attn_entropy_std": 0.6755548119544983, "attnres/final_alpha/block_0": 0.26143795251846313, "attnres/block_norm/0": 1.7816228866577148, "attnres/final_alpha/block_1": 0.003796311328187585, "attnres/block_norm/1": 50772.4765625, "attnres/final_alpha/block_2": 0.008326752111315727, "attnres/block_norm/2": 29929.873046875, "attnres/final_alpha/block_3": 0.010487409308552742, "attnres/block_norm/3": 72599.265625, "attnres/final_alpha/block_4": 0.011869717389345169, "attnres/block_norm/4": 17440.4296875, "attnres/final_alpha/block_5": 0.6051868200302124, "attnres/block_norm/5": 7212.0126953125, "attnres/final_alpha/block_6": 0.09889506548643112, "attnres/block_norm/6": 48982.71484375, "geo/tier1_time_s": 1.366030216217041, "geo/step": 12375.0, "geo/rankme_slope": 0.00020778158919817927} {"step": 12380, "timestamp": 1778339093.2104182, "train/loss": 2.339212989807129, "train/z_loss": 0.001349362381733954, "train/perplexity": 10.373069637890456, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700541.4555293876, "perf/iters_per_sec": 0.810881355061239, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2332260370254517, "data/tokens_consumed": 25964838912, "data/tokens_consumed_B": 25.964838912, "train/loss_slope": -4.456587807752328e-06} {"step": 12390, "timestamp": 1778339103.5833735, "train/loss": 2.344762182235718, "train/z_loss": 0.0013561258325353264, "train/perplexity": 10.430791804970127, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023204.8334781565, "perf/iters_per_sec": 0.9647392432585509, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365495204925537, "data/tokens_consumed": 25985810432, "data/tokens_consumed_B": 25.985810432, "train/loss_slope": -3.2952530978500654e-06} {"step": 12400, "timestamp": 1778339113.950498, "grad/layer_0/attn": 0.0028954893350601196, "grad/layer_0/mlp": 0.0032040439546108246, "grad/layer_0/attn_mlp_ratio": 0.9036983529902858, "grad/layer_4/attn": 0.00167638820130378, "grad/layer_4/mlp": 0.0025114258751273155, "grad/layer_4/attn_mlp_ratio": 0.6675045244838628, "grad/layer_8/attn": 0.003223464824259281, "grad/layer_8/mlp": 0.0034625199623405933, "grad/layer_8/attn_mlp_ratio": 0.930959176040206, "grad/layer_12/attn": 0.006199750583618879, "grad/layer_12/mlp": 0.007102816831320524, "grad/layer_12/attn_mlp_ratio": 0.8728580003632798, "grad/layer_16/attn": 0.0037745563313364983, "grad/layer_16/mlp": 0.004874553065747023, "grad/layer_16/attn_mlp_ratio": 0.7743389399996522, "grad/layer_20/attn": 0.0031219993252307177, "grad/layer_20/mlp": 0.006872240919619799, "grad/layer_20/attn_mlp_ratio": 0.45429130269407225, "grad/layer_24/attn": 0.012601806782186031, "grad/layer_24/mlp": 0.013048490509390831, "grad/layer_24/attn_mlp_ratio": 0.9657673948216412, "grad/layer_27/attn": 0.00744463037699461, "grad/layer_27/mlp": 0.012480554170906544, "grad/layer_27/attn_mlp_ratio": 0.5964983778283637} {"step": 12400, "timestamp": 1778339113.9659345, "train/loss": 2.2999885082244873, "train/z_loss": 0.0013632311369292439, "train/perplexity": 9.974067834407625, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021227.355549549, "perf/iters_per_sec": 0.9637963083026643, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375636339187622, "data/tokens_consumed": 26006781952, "data/tokens_consumed_B": 26.006781952, "train/loss_slope": -6.6161134097799885e-06} {"step": 12410, "timestamp": 1778339124.3328931, "train/loss": 2.361526131629944, "train/z_loss": 0.0013575444812886418, "train/perplexity": 10.607126978287853, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023886.537603465, "perf/iters_per_sec": 0.9650643051163983, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362003803253175, "data/tokens_consumed": 26027753472, "data/tokens_consumed_B": 26.027753472, "train/loss_slope": -4.818686357390003e-06} {"step": 12420, "timestamp": 1778339134.7024474, "train/loss": 2.311599588394165, "train/z_loss": 0.001370187452994287, "train/perplexity": 10.090552483274259, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023436.51666373, "perf/iters_per_sec": 0.9648497184103632, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036430835723877, "data/tokens_consumed": 26048724992, "data/tokens_consumed_B": 26.048724992, "train/loss_slope": -4.200832070511723e-06} {"step": 12430, "timestamp": 1778339145.0794785, "train/loss": 2.309471273422241, "train/z_loss": 0.0013729659374803304, "train/perplexity": 10.06909944685645, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022286.2560083407, "perf/iters_per_sec": 0.9643012313882545, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370203495025634, "data/tokens_consumed": 26069696512, "data/tokens_consumed_B": 26.069696512, "train/loss_slope": -6.770975025358975e-06} {"step": 12440, "timestamp": 1778339155.4478338, "train/loss": 2.313059759140015, "train/z_loss": 0.001367248222231865, "train/perplexity": 10.105297175083853, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023613.3166630096, "perf/iters_per_sec": 0.96493402321959, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363402843475342, "data/tokens_consumed": 26090668032, "data/tokens_consumed_B": 26.090668032, "train/loss_slope": -7.829763994465828e-06} {"step": 12450, "timestamp": 1778339165.8048327, "grad/layer_0/attn": 0.0031723924912512302, "grad/layer_0/mlp": 0.00354951829649508, "grad/layer_0/attn_mlp_ratio": 0.8937529368445485, "grad/layer_4/attn": 0.0025893626734614372, "grad/layer_4/mlp": 0.00283231888897717, "grad/layer_4/attn_mlp_ratio": 0.9142200025981285, "grad/layer_8/attn": 0.0038772898260504007, "grad/layer_8/mlp": 0.003693484468385577, "grad/layer_8/attn_mlp_ratio": 1.04976472874372, "grad/layer_12/attn": 0.00933161936700344, "grad/layer_12/mlp": 0.007124837022274733, "grad/layer_12/attn_mlp_ratio": 1.3097309042798375, "grad/layer_16/attn": 0.003484022803604603, "grad/layer_16/mlp": 0.0047776298597455025, "grad/layer_16/attn_mlp_ratio": 0.729236636775903, "grad/layer_20/attn": 0.004151975270360708, "grad/layer_20/mlp": 0.00641871290281415, "grad/layer_20/attn_mlp_ratio": 0.6468547929375191, "grad/layer_24/attn": 0.011699997819960117, "grad/layer_24/mlp": 0.01040340680629015, "grad/layer_24/attn_mlp_ratio": 1.1246313756012003, "grad/layer_27/attn": 0.009873777627944946, "grad/layer_27/mlp": 0.00996762327849865, "grad/layer_27/attn_mlp_ratio": 0.9905849421682466} {"step": 12450, "timestamp": 1778339166.4036288, "eos/sharpness": 62.982225418090806, "eos/L0_probe": 2.3122003078460693, "eos/L_plus": 2.5864572525024414, "eos/L_minus": 2.6677656173706055, "eos/grad_norm": 0.1835312396287918, "eos/embed_grad_frac": 0.06975888460874557, "eos/time_s": 0.5960292816162109} {"step": 12450, "timestamp": 1778339166.4228265, "train/loss": 2.3356157302856446, "train/z_loss": 0.0013711548061110079, "train/perplexity": 10.335822049156311, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912135.0253494338, "perf/iters_per_sec": 0.9117770315882844, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967593669891358, "data/tokens_consumed": 26111639552, "data/tokens_consumed_B": 26.111639552, "train/loss_slope": -9.953872899268037e-06} {"step": 12450, "timestamp": 1778339167.7885835, "geo/rankme_last": 429.64556884765625, "geo/layer_0/stable_rank_q_proj": 20.632221221923828, "geo/layer_0/stable_rank_k_proj": 16.932415008544922, "geo/layer_0/stable_rank_o_proj": 43.91993713378906, "geo/layer_0/stable_rank_gate_proj": 125.5850830078125, "geo/layer_0/stable_rank_down_proj": 57.64485549926758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0645572766661644, "geo/layer_0/attn_entropy_mean": 6.240983963012695, "geo/layer_0/attn_entropy_std": 0.453339159488678, "geo/layer_7/stable_rank_q_proj": 41.833396911621094, "geo/layer_7/stable_rank_k_proj": 38.7030143737793, "geo/layer_7/stable_rank_o_proj": 88.56470489501953, "geo/layer_7/stable_rank_gate_proj": 78.40735626220703, "geo/layer_7/stable_rank_down_proj": 144.6032257080078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4014512598514557, "geo/layer_7/attn_entropy_mean": 4.733341217041016, "geo/layer_7/attn_entropy_std": 0.7669654488563538, "geo/layer_14/stable_rank_q_proj": 52.02100372314453, "geo/layer_14/stable_rank_k_proj": 43.13561248779297, "geo/layer_14/stable_rank_o_proj": 42.081363677978516, "geo/layer_14/stable_rank_gate_proj": 72.04138946533203, "geo/layer_14/stable_rank_down_proj": 127.63681030273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37786027789115906, "geo/layer_14/attn_entropy_mean": 5.498805999755859, "geo/layer_14/attn_entropy_std": 0.4940336346626282, "geo/layer_21/stable_rank_q_proj": 38.265384674072266, "geo/layer_21/stable_rank_k_proj": 28.66928482055664, "geo/layer_21/stable_rank_o_proj": 65.20317840576172, "geo/layer_21/stable_rank_gate_proj": 59.98502731323242, "geo/layer_21/stable_rank_down_proj": 48.75477600097656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14335772395133972, "geo/layer_21/attn_entropy_mean": 5.842453956604004, "geo/layer_21/attn_entropy_std": 0.3313857614994049, "geo/layer_27/stable_rank_q_proj": 44.619258880615234, "geo/layer_27/stable_rank_k_proj": 30.299211502075195, "geo/layer_27/stable_rank_o_proj": 107.37991333007812, "geo/layer_27/stable_rank_gate_proj": 70.07128143310547, "geo/layer_27/stable_rank_down_proj": 130.01712036132812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10459829866886139, "geo/layer_27/attn_entropy_mean": 4.288277626037598, "geo/layer_27/attn_entropy_std": 0.6780113577842712, "attnres/final_alpha/block_0": 0.2624873220920563, "attnres/block_norm/0": 1.7819774150848389, "attnres/final_alpha/block_1": 0.0038342224434018135, "attnres/block_norm/1": 50809.078125, "attnres/final_alpha/block_2": 0.008438041433691978, "attnres/block_norm/2": 29963.529296875, "attnres/final_alpha/block_3": 0.010744527913630009, "attnres/block_norm/3": 72312.3046875, "attnres/final_alpha/block_4": 0.012055352330207825, "attnres/block_norm/4": 17553.140625, "attnres/final_alpha/block_5": 0.6015350818634033, "attnres/block_norm/5": 7299.1376953125, "attnres/final_alpha/block_6": 0.1009054183959961, "attnres/block_norm/6": 49088.1875, "geo/tier1_time_s": 1.3616180419921875, "geo/step": 12450.0, "geo/rankme_slope": 0.00021190755599114646} {"step": 12460, "timestamp": 1778339178.1606977, "train/loss": 2.297231578826904, "train/z_loss": 0.001359579397831112, "train/perplexity": 9.946607903519615, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787248.9955533203, "perf/iters_per_sec": 0.8522267320410348, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1733966588973999, "data/tokens_consumed": 26132611072, "data/tokens_consumed_B": 26.132611072, "train/loss_slope": -1.1028292498858915e-05} {"step": 12470, "timestamp": 1778339188.532326, "train/loss": 2.3588207721710206, "train/z_loss": 0.0013665351318195463, "train/perplexity": 10.57846966862552, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023338.447417001, "perf/iters_per_sec": 0.9648029553494458, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364810705184937, "data/tokens_consumed": 26153582592, "data/tokens_consumed_B": 26.153582592, "train/loss_slope": -8.862057413169241e-06} {"step": 12480, "timestamp": 1778339198.902873, "train/loss": 2.3675456285476684, "train/z_loss": 0.001359524007420987, "train/perplexity": 10.671169103751073, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023519.466370709, "perf/iters_per_sec": 0.9648892719129128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036388349533081, "data/tokens_consumed": 26174554112, "data/tokens_consumed_B": 26.174554112, "train/loss_slope": -7.77999830431952e-06} {"step": 12490, "timestamp": 1778339209.2820313, "train/loss": 2.3389025688171388, "train/z_loss": 0.0013563829008489848, "train/perplexity": 10.369850119073186, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021782.0165559119, "perf/iters_per_sec": 0.9640607912807044, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037278985977173, "data/tokens_consumed": 26195525632, "data/tokens_consumed_B": 26.195525632, "train/loss_slope": -5.1844849230253345e-06} {"step": 12500, "timestamp": 1778339219.6387835, "grad/layer_0/attn": 0.003105154260993004, "grad/layer_0/mlp": 0.003204215317964554, "grad/layer_0/attn_mlp_ratio": 0.9690841145023646, "grad/layer_4/attn": 0.0026275087147951126, "grad/layer_4/mlp": 0.002543063135817647, "grad/layer_4/attn_mlp_ratio": 1.033206204937454, "grad/layer_8/attn": 0.003917572554200888, "grad/layer_8/mlp": 0.0035083622206002474, "grad/layer_8/attn_mlp_ratio": 1.1166385327985817, "grad/layer_12/attn": 0.004693691153079271, "grad/layer_12/mlp": 0.006572758313268423, "grad/layer_12/attn_mlp_ratio": 0.7141128363403895, "grad/layer_16/attn": 0.004489624872803688, "grad/layer_16/mlp": 0.004471279680728912, "grad/layer_16/attn_mlp_ratio": 1.004102872773438, "grad/layer_20/attn": 0.002959970850497484, "grad/layer_20/mlp": 0.0063068680465221405, "grad/layer_20/attn_mlp_ratio": 0.4693249932820826, "grad/layer_24/attn": 0.017963500693440437, "grad/layer_24/mlp": 0.013692579232156277, "grad/layer_24/attn_mlp_ratio": 1.311915034974757, "grad/layer_27/attn": 0.009924648329615593, "grad/layer_27/mlp": 0.013721059076488018, "grad/layer_27/attn_mlp_ratio": 0.723315029981225} {"step": 12500, "timestamp": 1778339219.6546922, "train/loss": 2.3163897514343263, "train/z_loss": 0.0013548657880164683, "train/perplexity": 10.139003827107038, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023048.8565290358, "perf/iters_per_sec": 0.964664867653387, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366294384002686, "data/tokens_consumed": 26216497152, "data/tokens_consumed_B": 26.216497152, "train/loss_slope": -5.962596438934608e-06} {"step": 12500, "timestamp": 1778339226.7465436, "geo/ww_alpha_mean": 7.66596836243083, "geo/ww_alpha_std": 5.050803732140009, "geo/ww_alpha_min": 1.343585495812115, "geo/ww_alpha_max": 46.88287499163543, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.94453846070073, "geo/ww_alpha_by_type/k_proj": 4.430617721917714, "geo/ww_alpha_by_type/v_proj": 9.352483517684755, "geo/ww_alpha_by_type/o_proj": 9.48405096660383, "geo/ww_alpha_by_type/gate_proj": 7.661094704902853, "geo/ww_alpha_by_type/up_proj": 11.130521742441358, "geo/ww_alpha_by_type/down_proj": 7.756917720844383, "geo/twonn_id/layer_0": 0.726718008518219, "geo/twonn_id/layer_7": 3.283278465270996, "geo/twonn_id/layer_14": 5.554690361022949, "geo/twonn_id/layer_21": 6.451639652252197, "geo/twonn_id/layer_27": 6.4109601974487305, "geo/tier2_time_s": 7.082078218460083} {"step": 12500, "timestamp": 1778339227.533289, "eoc/jacobian_sigma/layer_0/attn": 1440.709716796875, "eoc/jacobian_sigma/layer_0/mlp": 10473.08984375, "eoc/jacobian_sigma/layer_0": 10473.08984375, "eoc/jacobian_sigma/layer_7/attn": 1.1533641815185547, "eoc/jacobian_sigma/layer_7/mlp": 1.8103057146072388, "eoc/jacobian_sigma/layer_7": 1.8103057146072388, "eoc/jacobian_sigma/layer_14/attn": 2.4329018592834473, "eoc/jacobian_sigma/layer_14/mlp": 12.523659706115723, "eoc/jacobian_sigma/layer_14": 12.523659706115723, "eoc/jacobian_sigma/layer_21/attn": 1.0958528518676758, "eoc/jacobian_sigma/layer_21/mlp": 5.9791975021362305, "eoc/jacobian_sigma/layer_21": 5.9791975021362305, "eoc/jacobian_sigma/layer_27/attn": 4.041937828063965, "eoc/jacobian_sigma/layer_27/mlp": 40.69707489013672, "eoc/jacobian_sigma/layer_27": 40.69707489013672, "eoc/layer0_sigma": 10473.08984375, "eoc/sigma_max": 40.69707489013672, "eoc/sigma_min": 1.8103057146072388, "eoc/sigma_mean": 15.252559453248978, "eoc/time_s": 0.779632568359375} {"step": 12510, "timestamp": 1778339237.915818, "train/loss": 2.3394049167633058, "train/z_loss": 0.001362459734082222, "train/perplexity": 10.37506070063546, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1148944.6556244623, "perf/iters_per_sec": 0.5478595045206367, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8252854824066163, "data/tokens_consumed": 26237468672, "data/tokens_consumed_B": 26.237468672, "train/loss_slope": -5.01386323133955e-06} {"step": 12520, "timestamp": 1778339248.854625, "train/loss": 2.3028405904769897, "train/z_loss": 0.0013809381634928286, "train/perplexity": 10.002555301252059, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918335.3941062363, "perf/iters_per_sec": 0.9147335978060895, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.093214464187622, "data/tokens_consumed": 26258440192, "data/tokens_consumed_B": 26.258440192, "train/loss_slope": -8.152388682280954e-06} {"step": 12525, "timestamp": 1778339254.6563954, "eos/sharpness": 40.034818649291985, "eos/L0_probe": 2.3124992847442627, "eos/L_plus": 2.493133068084717, "eos/L_minus": 2.5322136878967285, "eos/grad_norm": 0.12117737531661987, "eos/embed_grad_frac": 0.1475805640220642, "eos/time_s": 0.6284170150756836} {"step": 12525, "timestamp": 1778339256.038532, "geo/rankme_last": 430.932861328125, "geo/layer_0/stable_rank_q_proj": 20.622220993041992, "geo/layer_0/stable_rank_k_proj": 16.954105377197266, "geo/layer_0/stable_rank_o_proj": 43.923744201660156, "geo/layer_0/stable_rank_gate_proj": 125.29270935058594, "geo/layer_0/stable_rank_down_proj": 57.63274002075195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06510435044765472, "geo/layer_0/attn_entropy_mean": 6.2401533126831055, "geo/layer_0/attn_entropy_std": 0.4482104778289795, "geo/layer_7/stable_rank_q_proj": 41.772945404052734, "geo/layer_7/stable_rank_k_proj": 38.83173751831055, "geo/layer_7/stable_rank_o_proj": 88.58110046386719, "geo/layer_7/stable_rank_gate_proj": 78.3335952758789, "geo/layer_7/stable_rank_down_proj": 144.64276123046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3996347188949585, "geo/layer_7/attn_entropy_mean": 4.699823379516602, "geo/layer_7/attn_entropy_std": 0.7640619874000549, "geo/layer_14/stable_rank_q_proj": 52.03683853149414, "geo/layer_14/stable_rank_k_proj": 43.264774322509766, "geo/layer_14/stable_rank_o_proj": 42.14485168457031, "geo/layer_14/stable_rank_gate_proj": 72.03563690185547, "geo/layer_14/stable_rank_down_proj": 127.76773071289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38045015931129456, "geo/layer_14/attn_entropy_mean": 5.568356513977051, "geo/layer_14/attn_entropy_std": 0.4506691098213196, "geo/layer_21/stable_rank_q_proj": 38.357086181640625, "geo/layer_21/stable_rank_k_proj": 28.670751571655273, "geo/layer_21/stable_rank_o_proj": 65.27735900878906, "geo/layer_21/stable_rank_gate_proj": 59.89598083496094, "geo/layer_21/stable_rank_down_proj": 48.68916320800781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1415710151195526, "geo/layer_21/attn_entropy_mean": 5.8369526863098145, "geo/layer_21/attn_entropy_std": 0.3257446587085724, "geo/layer_27/stable_rank_q_proj": 44.619102478027344, "geo/layer_27/stable_rank_k_proj": 30.3262939453125, "geo/layer_27/stable_rank_o_proj": 107.61775970458984, "geo/layer_27/stable_rank_gate_proj": 70.05220031738281, "geo/layer_27/stable_rank_down_proj": 129.83377075195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1015789583325386, "geo/layer_27/attn_entropy_mean": 4.308245658874512, "geo/layer_27/attn_entropy_std": 0.6645992398262024, "attnres/final_alpha/block_0": 0.2627815008163452, "attnres/block_norm/0": 1.781966209411621, "attnres/final_alpha/block_1": 0.003884769743308425, "attnres/block_norm/1": 50716.484375, "attnres/final_alpha/block_2": 0.008210700005292892, "attnres/block_norm/2": 30005.6640625, "attnres/final_alpha/block_3": 0.010679276660084724, "attnres/block_norm/3": 72166.140625, "attnres/final_alpha/block_4": 0.011739026755094528, "attnres/block_norm/4": 17569.8046875, "attnres/final_alpha/block_5": 0.6026961803436279, "attnres/block_norm/5": 7279.83203125, "attnres/final_alpha/block_6": 0.10000849515199661, "attnres/block_norm/6": 48591.671875, "geo/tier1_time_s": 1.3619768619537354, "geo/step": 12525.0, "geo/rankme_slope": 0.0002479117623611945} {"step": 12530, "timestamp": 1778339261.2216105, "train/loss": 2.3232521772384644, "train/z_loss": 0.0013717240770347416, "train/perplexity": 10.208821273128128, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696830.491167431, "perf/iters_per_sec": 0.8091118293606906, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2359230995178223, "data/tokens_consumed": 26279411712, "data/tokens_consumed_B": 26.279411712, "train/loss_slope": -6.937037507156425e-06} {"step": 12540, "timestamp": 1778339271.573809, "train/loss": 2.3670104503631593, "train/z_loss": 0.0013565568835474551, "train/perplexity": 10.665459654766082, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026844.8620378668, "perf/iters_per_sec": 0.9664749441327414, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346879720687867, "data/tokens_consumed": 26300383232, "data/tokens_consumed_B": 26.300383232, "train/loss_slope": -6.147685882174595e-06} {"step": 12550, "timestamp": 1778339281.924782, "grad/layer_0/attn": 0.003230246715247631, "grad/layer_0/mlp": 0.0035991016775369644, "grad/layer_0/attn_mlp_ratio": 0.8975146897507971, "grad/layer_4/attn": 0.002311304910108447, "grad/layer_4/mlp": 0.002572985365986824, "grad/layer_4/attn_mlp_ratio": 0.8982969164273868, "grad/layer_8/attn": 0.004124284256249666, "grad/layer_8/mlp": 0.0035044720862060785, "grad/layer_8/attn_mlp_ratio": 1.1768631728575916, "grad/layer_12/attn": 0.009283512830734253, "grad/layer_12/mlp": 0.007890764623880386, "grad/layer_12/attn_mlp_ratio": 1.1765035653184404, "grad/layer_16/attn": 0.004526286385953426, "grad/layer_16/mlp": 0.005795316305011511, "grad/layer_16/attn_mlp_ratio": 0.7810248948684341, "grad/layer_20/attn": 0.007183460518717766, "grad/layer_20/mlp": 0.007291160989552736, "grad/layer_20/attn_mlp_ratio": 0.9852286118065213, "grad/layer_24/attn": 0.015072057023644447, "grad/layer_24/mlp": 0.01313224621117115, "grad/layer_24/attn_mlp_ratio": 1.1477135492671324, "grad/layer_27/attn": 0.013299262151122093, "grad/layer_27/mlp": 0.013565526343882084, "grad/layer_27/attn_mlp_ratio": 0.9803719897004017} {"step": 12550, "timestamp": 1778339281.94098, "train/loss": 2.3490057706832888, "train/z_loss": 0.0013534745434299112, "train/perplexity": 10.475149844638292, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023881.2289274454, "perf/iters_per_sec": 0.9650617737424113, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036203098297119, "data/tokens_consumed": 26321354752, "data/tokens_consumed_B": 26.321354752, "train/loss_slope": -6.791189186858396e-06} {"step": 12560, "timestamp": 1778339292.7247334, "train/loss": 2.348125433921814, "train/z_loss": 0.0013623582082800568, "train/perplexity": 10.465932243040168, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946452.7186514225, "perf/iters_per_sec": 0.9281409829384911, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.077422523498535, "data/tokens_consumed": 26342326272, "data/tokens_consumed_B": 26.342326272, "train/loss_slope": -7.952557889112526e-06} {"step": 12570, "timestamp": 1778339303.0873332, "train/loss": 2.3586827754974364, "train/z_loss": 0.0013562996056862176, "train/perplexity": 10.577009975718338, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025139.4184632835, "perf/iters_per_sec": 0.9656617252651613, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035559320449829, "data/tokens_consumed": 26363297792, "data/tokens_consumed_B": 26.363297792, "train/loss_slope": -6.759837050713993e-06} {"step": 12580, "timestamp": 1778339313.4494927, "train/loss": 2.394522023200989, "train/z_loss": 0.0013515092781744898, "train/perplexity": 10.96295676758887, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024886.3688899712, "perf/iters_per_sec": 0.9655410618257385, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356887340545655, "data/tokens_consumed": 26384269312, "data/tokens_consumed_B": 26.384269312, "train/loss_slope": -1.202464826179206e-06} {"step": 12590, "timestamp": 1778339323.8088858, "train/loss": 2.2911067962646485, "train/z_loss": 0.001366436027456075, "train/perplexity": 9.885873275933719, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025950.8808400854, "perf/iters_per_sec": 0.9660486606789043, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351445436477662, "data/tokens_consumed": 26405240832, "data/tokens_consumed_B": 26.405240832, "train/loss_slope": -3.4779021305755382e-06} {"step": 12600, "timestamp": 1778339334.1607072, "grad/layer_0/attn": 0.003167081158608198, "grad/layer_0/mlp": 0.0031547180842608213, "grad/layer_0/attn_mlp_ratio": 1.003918883914594, "grad/layer_4/attn": 0.0020832912996411324, "grad/layer_4/mlp": 0.0025519593618810177, "grad/layer_4/attn_mlp_ratio": 0.8163496837467646, "grad/layer_8/attn": 0.006476463284343481, "grad/layer_8/mlp": 0.0033594693522900343, "grad/layer_8/attn_mlp_ratio": 1.9278232400442574, "grad/layer_12/attn": 0.007278892677277327, "grad/layer_12/mlp": 0.007095340173691511, "grad/layer_12/attn_mlp_ratio": 1.0258694293022705, "grad/layer_16/attn": 0.0033095323015004396, "grad/layer_16/mlp": 0.004678115714341402, "grad/layer_16/attn_mlp_ratio": 0.7074498436645406, "grad/layer_20/attn": 0.0028367622289806604, "grad/layer_20/mlp": 0.005590913351625204, "grad/layer_20/attn_mlp_ratio": 0.5073879704140388, "grad/layer_24/attn": 0.009250742383301258, "grad/layer_24/mlp": 0.008738548494875431, "grad/layer_24/attn_mlp_ratio": 1.0586131418581564, "grad/layer_27/attn": 0.005304343998432159, "grad/layer_27/mlp": 0.008851856924593449, "grad/layer_27/attn_mlp_ratio": 0.5992351642931987} {"step": 12600, "timestamp": 1778339334.7694669, "eos/sharpness": 49.149060249328606, "eos/L0_probe": 2.313948154449463, "eos/L_plus": 2.582690715789795, "eos/L_minus": 2.536696195602417, "eos/grad_norm": 0.13928981125354767, "eos/embed_grad_frac": 0.10803037136793137, "eos/time_s": 0.6060154438018799} {"step": 12600, "timestamp": 1778339334.7888913, "train/loss": 2.3812265157699586, "train/z_loss": 0.0013569322996772825, "train/perplexity": 10.818163378139277, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910833.040579834, "perf/iters_per_sec": 0.9111561968707247, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0975066661834716, "data/tokens_consumed": 26426212352, "data/tokens_consumed_B": 26.426212352, "train/loss_slope": -1.5283878737299158e-06} {"step": 12600, "timestamp": 1778339336.1530018, "geo/rankme_last": 429.104248046875, "geo/layer_0/stable_rank_q_proj": 20.618541717529297, "geo/layer_0/stable_rank_k_proj": 16.939302444458008, "geo/layer_0/stable_rank_o_proj": 43.88175964355469, "geo/layer_0/stable_rank_gate_proj": 125.30802917480469, "geo/layer_0/stable_rank_down_proj": 57.57766342163086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06558874249458313, "geo/layer_0/attn_entropy_mean": 6.239836692810059, "geo/layer_0/attn_entropy_std": 0.4569106101989746, "geo/layer_7/stable_rank_q_proj": 41.79876708984375, "geo/layer_7/stable_rank_k_proj": 38.800079345703125, "geo/layer_7/stable_rank_o_proj": 88.5073471069336, "geo/layer_7/stable_rank_gate_proj": 78.41316223144531, "geo/layer_7/stable_rank_down_proj": 144.5098876953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3898604214191437, "geo/layer_7/attn_entropy_mean": 4.748312950134277, "geo/layer_7/attn_entropy_std": 0.774237334728241, "geo/layer_14/stable_rank_q_proj": 52.00276184082031, "geo/layer_14/stable_rank_k_proj": 43.34138870239258, "geo/layer_14/stable_rank_o_proj": 42.18349075317383, "geo/layer_14/stable_rank_gate_proj": 71.9321517944336, "geo/layer_14/stable_rank_down_proj": 127.71326446533203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3798656761646271, "geo/layer_14/attn_entropy_mean": 5.5122246742248535, "geo/layer_14/attn_entropy_std": 0.4681459963321686, "geo/layer_21/stable_rank_q_proj": 38.376731872558594, "geo/layer_21/stable_rank_k_proj": 28.616884231567383, "geo/layer_21/stable_rank_o_proj": 65.31187438964844, "geo/layer_21/stable_rank_gate_proj": 59.98929214477539, "geo/layer_21/stable_rank_down_proj": 48.72275924682617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13800457119941711, "geo/layer_21/attn_entropy_mean": 5.856539726257324, "geo/layer_21/attn_entropy_std": 0.3251020014286041, "geo/layer_27/stable_rank_q_proj": 44.64823532104492, "geo/layer_27/stable_rank_k_proj": 30.312288284301758, "geo/layer_27/stable_rank_o_proj": 107.44440460205078, "geo/layer_27/stable_rank_gate_proj": 70.03424835205078, "geo/layer_27/stable_rank_down_proj": 129.9882354736328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09891723841428757, "geo/layer_27/attn_entropy_mean": 4.293108940124512, "geo/layer_27/attn_entropy_std": 0.6720569729804993, "attnres/final_alpha/block_0": 0.2589946389198303, "attnres/block_norm/0": 1.7820392847061157, "attnres/final_alpha/block_1": 0.0037449956871569157, "attnres/block_norm/1": 50721.61328125, "attnres/final_alpha/block_2": 0.008147984743118286, "attnres/block_norm/2": 29993.57421875, "attnres/final_alpha/block_3": 0.01041538454592228, "attnres/block_norm/3": 73133.90625, "attnres/final_alpha/block_4": 0.011616596952080727, "attnres/block_norm/4": 17529.830078125, "attnres/final_alpha/block_5": 0.6094956398010254, "attnres/block_norm/5": 7144.5400390625, "attnres/final_alpha/block_6": 0.09758472442626953, "attnres/block_norm/6": 49018.89453125, "geo/tier1_time_s": 1.3601458072662354, "geo/step": 12600.0, "geo/rankme_slope": 0.00023380150888480393} {"step": 12610, "timestamp": 1778339346.5246158, "train/loss": 2.3306109428405763, "train/z_loss": 0.0013597074663266539, "train/perplexity": 10.284222686356376, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787572.7981542754, "perf/iters_per_sec": 0.852381133153093, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1731841087341308, "data/tokens_consumed": 26447183872, "data/tokens_consumed_B": 26.447183872, "train/loss_slope": -1.3177260242350244e-06} {"step": 12620, "timestamp": 1778339356.8756344, "train/loss": 2.34494469165802, "train/z_loss": 0.0013595727737993, "train/perplexity": 10.432695696490393, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027436.2098370597, "perf/iters_per_sec": 0.9667569207368182, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343861818313598, "data/tokens_consumed": 26468155392, "data/tokens_consumed_B": 26.468155392, "train/loss_slope": -5.46568118877637e-08} {"step": 12630, "timestamp": 1778339367.2276766, "train/loss": 2.3654454946517944, "train/z_loss": 0.0013651743764057755, "train/perplexity": 10.648781736270731, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026882.9262607696, "perf/iters_per_sec": 0.9664930945686195, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03466854095459, "data/tokens_consumed": 26489126912, "data/tokens_consumed_B": 26.489126912, "train/loss_slope": 1.7981906642508467e-06} {"step": 12640, "timestamp": 1778339377.5837867, "train/loss": 2.3438612699508665, "train/z_loss": 0.0013623753911815584, "train/perplexity": 10.42139880826068, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026659.6054304135, "perf/iters_per_sec": 0.966386606898505, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347825527191161, "data/tokens_consumed": 26510098432, "data/tokens_consumed_B": 26.510098432, "train/loss_slope": 1.275198172779094e-06} {"step": 12650, "timestamp": 1778339387.9287872, "grad/layer_0/attn": 0.003114663064479828, "grad/layer_0/mlp": 0.00334514444693923, "grad/layer_0/attn_mlp_ratio": 0.9310996941312177, "grad/layer_4/attn": 0.0020976439118385315, "grad/layer_4/mlp": 0.002572581171989441, "grad/layer_4/attn_mlp_ratio": 0.8153848955824713, "grad/layer_8/attn": 0.003733831923455, "grad/layer_8/mlp": 0.0034779647830873728, "grad/layer_8/attn_mlp_ratio": 1.0735680344594207, "grad/layer_12/attn": 0.004835017956793308, "grad/layer_12/mlp": 0.007041964214295149, "grad/layer_12/attn_mlp_ratio": 0.68660074675162, "grad/layer_16/attn": 0.0043204533867537975, "grad/layer_16/mlp": 0.004947475157678127, "grad/layer_16/attn_mlp_ratio": 0.8732642735400775, "grad/layer_20/attn": 0.004496017936617136, "grad/layer_20/mlp": 0.0062187835574150085, "grad/layer_20/attn_mlp_ratio": 0.7229738457385118, "grad/layer_24/attn": 0.00743882218375802, "grad/layer_24/mlp": 0.00871674157679081, "grad/layer_24/attn_mlp_ratio": 0.8533948187961821, "grad/layer_27/attn": 0.009332654066383839, "grad/layer_27/mlp": 0.007478400133550167, "grad/layer_27/attn_mlp_ratio": 1.2479479266855735} {"step": 12650, "timestamp": 1778339387.9444685, "train/loss": 2.3146870851516725, "train/z_loss": 0.0013632370624691247, "train/perplexity": 10.12175517566663, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025475.408247913, "perf/iters_per_sec": 0.9658219376792493, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353875398635863, "data/tokens_consumed": 26531069952, "data/tokens_consumed_B": 26.531069952, "train/loss_slope": 4.4011464535286504e-07} {"step": 12660, "timestamp": 1778339398.2864482, "train/loss": 2.286143922805786, "train/z_loss": 0.0013698686379939318, "train/perplexity": 9.836932481770512, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028756.4237987678, "perf/iters_per_sec": 0.9673864478105392, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337130546569824, "data/tokens_consumed": 26552041472, "data/tokens_consumed_B": 26.552041472, "train/loss_slope": -2.1251870746766213e-06} {"step": 12670, "timestamp": 1778339408.6363556, "train/loss": 2.36317241191864, "train/z_loss": 0.0013654767884872855, "train/perplexity": 10.624603664166479, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027692.0005787022, "perf/iters_per_sec": 0.9668788912671576, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342556953430175, "data/tokens_consumed": 26573012992, "data/tokens_consumed_B": 26.573012992, "train/loss_slope": -2.5280229400800774e-07} {"step": 12675, "timestamp": 1778339414.403717, "eos/sharpness": 20.54493427276611, "eos/L0_probe": 2.3104071617126465, "eos/L_plus": 2.439190149307251, "eos/L_minus": 2.387073516845703, "eos/grad_norm": 0.1023787409067154, "eos/embed_grad_frac": 0.19357530772686005, "eos/time_s": 0.5991983413696289} {"step": 12675, "timestamp": 1778339415.7792377, "geo/rankme_last": 429.2929382324219, "geo/layer_0/stable_rank_q_proj": 20.591279983520508, "geo/layer_0/stable_rank_k_proj": 16.959592819213867, "geo/layer_0/stable_rank_o_proj": 43.850738525390625, "geo/layer_0/stable_rank_gate_proj": 125.2347640991211, "geo/layer_0/stable_rank_down_proj": 57.609703063964844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06246289238333702, "geo/layer_0/attn_entropy_mean": 6.240013122558594, "geo/layer_0/attn_entropy_std": 0.45683932304382324, "geo/layer_7/stable_rank_q_proj": 41.83344268798828, "geo/layer_7/stable_rank_k_proj": 38.640846252441406, "geo/layer_7/stable_rank_o_proj": 88.48445129394531, "geo/layer_7/stable_rank_gate_proj": 78.41508483886719, "geo/layer_7/stable_rank_down_proj": 144.7336883544922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40041977167129517, "geo/layer_7/attn_entropy_mean": 4.722936630249023, "geo/layer_7/attn_entropy_std": 0.785730242729187, "geo/layer_14/stable_rank_q_proj": 51.91545104980469, "geo/layer_14/stable_rank_k_proj": 43.35169982910156, "geo/layer_14/stable_rank_o_proj": 42.219825744628906, "geo/layer_14/stable_rank_gate_proj": 71.81871032714844, "geo/layer_14/stable_rank_down_proj": 127.790771484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3773711025714874, "geo/layer_14/attn_entropy_mean": 5.507069110870361, "geo/layer_14/attn_entropy_std": 0.44901105761528015, "geo/layer_21/stable_rank_q_proj": 38.38076400756836, "geo/layer_21/stable_rank_k_proj": 28.60634994506836, "geo/layer_21/stable_rank_o_proj": 65.2860336303711, "geo/layer_21/stable_rank_gate_proj": 59.9326057434082, "geo/layer_21/stable_rank_down_proj": 48.6856803894043, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13306894898414612, "geo/layer_21/attn_entropy_mean": 5.852718353271484, "geo/layer_21/attn_entropy_std": 0.3313375413417816, "geo/layer_27/stable_rank_q_proj": 44.57907485961914, "geo/layer_27/stable_rank_k_proj": 30.346200942993164, "geo/layer_27/stable_rank_o_proj": 107.14725494384766, "geo/layer_27/stable_rank_gate_proj": 69.9919662475586, "geo/layer_27/stable_rank_down_proj": 130.2342987060547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09886538237333298, "geo/layer_27/attn_entropy_mean": 4.296870231628418, "geo/layer_27/attn_entropy_std": 0.672417163848877, "attnres/final_alpha/block_0": 0.26201826333999634, "attnres/block_norm/0": 1.7819381952285767, "attnres/final_alpha/block_1": 0.003817611839622259, "attnres/block_norm/1": 50623.3671875, "attnres/final_alpha/block_2": 0.00828326866030693, "attnres/block_norm/2": 29971.40625, "attnres/final_alpha/block_3": 0.01072600856423378, "attnres/block_norm/3": 72785.328125, "attnres/final_alpha/block_4": 0.011899376288056374, "attnres/block_norm/4": 17497.3984375, "attnres/final_alpha/block_5": 0.6043765544891357, "attnres/block_norm/5": 7187.68505859375, "attnres/final_alpha/block_6": 0.09887893497943878, "attnres/block_norm/6": 48852.78515625, "geo/tier1_time_s": 1.356358289718628, "geo/step": 12675.0, "geo/rankme_slope": 0.00019664115646258502} {"step": 12680, "timestamp": 1778339420.954821, "train/loss": 2.2941204071044923, "train/z_loss": 0.001365370349958539, "train/perplexity": 9.915710386938628, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703473.4906147951, "perf/iters_per_sec": 0.8122794583391166, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2311033964157105, "data/tokens_consumed": 26593984512, "data/tokens_consumed_B": 26.593984512, "train/loss_slope": -1.973743321406957e-06} {"step": 12690, "timestamp": 1778339431.3052366, "train/loss": 2.301926040649414, "train/z_loss": 0.0013637324329465628, "train/perplexity": 9.993411647826628, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027105.455429732, "perf/iters_per_sec": 0.9665992047451649, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345549583435059, "data/tokens_consumed": 26614956032, "data/tokens_consumed_B": 26.614956032, "train/loss_slope": -3.023225725358717e-06} {"step": 12700, "timestamp": 1778339441.653627, "grad/layer_0/attn": 0.002833561273291707, "grad/layer_0/mlp": 0.0030908670742064714, "grad/layer_0/attn_mlp_ratio": 0.9167528442949596, "grad/layer_4/attn": 0.0024977312423288822, "grad/layer_4/mlp": 0.0027319693472236395, "grad/layer_4/attn_mlp_ratio": 0.914260313147791, "grad/layer_8/attn": 0.004301925655454397, "grad/layer_8/mlp": 0.00348155340179801, "grad/layer_8/attn_mlp_ratio": 1.235633935607399, "grad/layer_12/attn": 0.006958990357816219, "grad/layer_12/mlp": 0.007345310412347317, "grad/layer_12/attn_mlp_ratio": 0.9474058783652916, "grad/layer_16/attn": 0.0077072069980204105, "grad/layer_16/mlp": 0.004968270659446716, "grad/layer_16/attn_mlp_ratio": 1.5512856225409728, "grad/layer_20/attn": 0.0036613778211176395, "grad/layer_20/mlp": 0.006207131315022707, "grad/layer_20/attn_mlp_ratio": 0.5898663289545072, "grad/layer_24/attn": 0.011101569049060345, "grad/layer_24/mlp": 0.01048275362700224, "grad/layer_24/attn_mlp_ratio": 1.059031752359508, "grad/layer_27/attn": 0.005384143907576799, "grad/layer_27/mlp": 0.011503064073622227, "grad/layer_27/attn_mlp_ratio": 0.4680617117587873} {"step": 12700, "timestamp": 1778339441.6695392, "train/loss": 2.327132058143616, "train/z_loss": 0.0013668725732713938, "train/perplexity": 10.248507222444042, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024866.511743457, "perf/iters_per_sec": 0.9655315932004247, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356988906860352, "data/tokens_consumed": 26635927552, "data/tokens_consumed_B": 26.635927552, "train/loss_slope": -5.745166549087375e-06} {"step": 12710, "timestamp": 1778339452.47787, "train/loss": 2.3517821550369264, "train/z_loss": 0.0013587448513135314, "train/perplexity": 10.504273297009986, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1941370.9143265919, "perf/iters_per_sec": 0.9257177898056945, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0802428245544433, "data/tokens_consumed": 26656899072, "data/tokens_consumed_B": 26.656899072, "train/loss_slope": -2.185554759050939e-06} {"step": 12720, "timestamp": 1778339462.8279748, "train/loss": 2.334197521209717, "train/z_loss": 0.0013613714603707195, "train/perplexity": 10.32117408191421, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027446.8178175748, "perf/iters_per_sec": 0.9667619790161013, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343807697296143, "data/tokens_consumed": 26677870592, "data/tokens_consumed_B": 26.677870592, "train/loss_slope": -2.45564820134805e-06} {"step": 12730, "timestamp": 1778339473.18171, "train/loss": 2.3343369245529173, "train/z_loss": 0.0013652144116349518, "train/perplexity": 10.322612988378838, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026924.027885903, "perf/iters_per_sec": 0.9665126933507456, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034647560119629, "data/tokens_consumed": 26698842112, "data/tokens_consumed_B": 26.698842112, "train/loss_slope": -3.2679631908189303e-07} {"step": 12740, "timestamp": 1778339483.5418582, "train/loss": 2.3667967081069947, "train/z_loss": 0.001352701080031693, "train/perplexity": 10.663180238968856, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025314.8840230524, "perf/iters_per_sec": 0.965745393764044, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354696035385131, "data/tokens_consumed": 26719813632, "data/tokens_consumed_B": 26.719813632, "train/loss_slope": 3.110457968862154e-06} {"step": 12750, "timestamp": 1778339493.8887007, "grad/layer_0/attn": 0.00323860882781446, "grad/layer_0/mlp": 0.003425785806030035, "grad/layer_0/attn_mlp_ratio": 0.9453622954411401, "grad/layer_4/attn": 0.004053112585097551, "grad/layer_4/mlp": 0.002646468114107847, "grad/layer_4/attn_mlp_ratio": 1.5315175763272482, "grad/layer_8/attn": 0.005910971201956272, "grad/layer_8/mlp": 0.0034223312977701426, "grad/layer_8/attn_mlp_ratio": 1.7271767444285582, "grad/layer_12/attn": 0.0046461597084999084, "grad/layer_12/mlp": 0.007109066471457481, "grad/layer_12/attn_mlp_ratio": 0.6535541145660373, "grad/layer_16/attn": 0.007575855124741793, "grad/layer_16/mlp": 0.005086016841232777, "grad/layer_16/attn_mlp_ratio": 1.4895457904049993, "grad/layer_20/attn": 0.0034059444442391396, "grad/layer_20/mlp": 0.0064869411289691925, "grad/layer_20/attn_mlp_ratio": 0.5250462928550936, "grad/layer_24/attn": 0.017974819988012314, "grad/layer_24/mlp": 0.014310966245830059, "grad/layer_24/attn_mlp_ratio": 1.256017207618536, "grad/layer_27/attn": 0.015897581353783607, "grad/layer_27/mlp": 0.01587888039648533, "grad/layer_27/attn_mlp_ratio": 1.0011777188764923} {"step": 12750, "timestamp": 1778339494.4965472, "eos/sharpness": 75.33872127532958, "eos/L0_probe": 2.30598521232605, "eos/L_plus": 2.7643885612487793, "eos/L_minus": 2.600969076156616, "eos/grad_norm": 0.3203999400138855, "eos/embed_grad_frac": 0.02195177972316742, "eos/time_s": 0.6050851345062256} {"step": 12750, "timestamp": 1778339494.517199, "train/loss": 2.3248101711273192, "train/z_loss": 0.0013588033965788783, "train/perplexity": 10.2247389508866, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911729.3356780387, "perf/iters_per_sec": 0.911583583678264, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0969921112060548, "data/tokens_consumed": 26740785152, "data/tokens_consumed_B": 26.740785152, "train/loss_slope": 4.888702490434962e-06} {"step": 12750, "timestamp": 1778339495.8805132, "geo/rankme_last": 429.3449401855469, "geo/layer_0/stable_rank_q_proj": 20.59857940673828, "geo/layer_0/stable_rank_k_proj": 16.96286392211914, "geo/layer_0/stable_rank_o_proj": 43.88359069824219, "geo/layer_0/stable_rank_gate_proj": 125.0933837890625, "geo/layer_0/stable_rank_down_proj": 57.59823226928711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06471249461174011, "geo/layer_0/attn_entropy_mean": 6.238194942474365, "geo/layer_0/attn_entropy_std": 0.4595656096935272, "geo/layer_7/stable_rank_q_proj": 41.827274322509766, "geo/layer_7/stable_rank_k_proj": 38.69124221801758, "geo/layer_7/stable_rank_o_proj": 88.5497817993164, "geo/layer_7/stable_rank_gate_proj": 78.33918762207031, "geo/layer_7/stable_rank_down_proj": 144.86338806152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3973254859447479, "geo/layer_7/attn_entropy_mean": 4.722450256347656, "geo/layer_7/attn_entropy_std": 0.7655310034751892, "geo/layer_14/stable_rank_q_proj": 51.881317138671875, "geo/layer_14/stable_rank_k_proj": 43.36043167114258, "geo/layer_14/stable_rank_o_proj": 42.25048065185547, "geo/layer_14/stable_rank_gate_proj": 71.81278228759766, "geo/layer_14/stable_rank_down_proj": 127.70354461669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3921106159687042, "geo/layer_14/attn_entropy_mean": 5.524059295654297, "geo/layer_14/attn_entropy_std": 0.4589611291885376, "geo/layer_21/stable_rank_q_proj": 38.40550231933594, "geo/layer_21/stable_rank_k_proj": 28.60122299194336, "geo/layer_21/stable_rank_o_proj": 65.3418960571289, "geo/layer_21/stable_rank_gate_proj": 60.00971984863281, "geo/layer_21/stable_rank_down_proj": 48.707923889160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13494203984737396, "geo/layer_21/attn_entropy_mean": 5.828907012939453, "geo/layer_21/attn_entropy_std": 0.33328524231910706, "geo/layer_27/stable_rank_q_proj": 44.648799896240234, "geo/layer_27/stable_rank_k_proj": 30.37067413330078, "geo/layer_27/stable_rank_o_proj": 107.04740905761719, "geo/layer_27/stable_rank_gate_proj": 69.98086547851562, "geo/layer_27/stable_rank_down_proj": 130.30923461914062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10198399424552917, "geo/layer_27/attn_entropy_mean": 4.325872898101807, "geo/layer_27/attn_entropy_std": 0.7084136605262756, "attnres/final_alpha/block_0": 0.25861430168151855, "attnres/block_norm/0": 1.781945824623108, "attnres/final_alpha/block_1": 0.003765830770134926, "attnres/block_norm/1": 50729.0234375, "attnres/final_alpha/block_2": 0.0080255176872015, "attnres/block_norm/2": 29972.61328125, "attnres/final_alpha/block_3": 0.010424831882119179, "attnres/block_norm/3": 73204.1640625, "attnres/final_alpha/block_4": 0.011730337515473366, "attnres/block_norm/4": 17478.78125, "attnres/final_alpha/block_5": 0.6100912690162659, "attnres/block_norm/5": 7229.3916015625, "attnres/final_alpha/block_6": 0.0973479375243187, "attnres/block_norm/6": 49058.06640625, "geo/tier1_time_s": 1.3590869903564453, "geo/step": 12750.0, "geo/rankme_slope": 0.00018579826461834735} {"step": 12760, "timestamp": 1778339506.234398, "train/loss": 2.308724522590637, "train/z_loss": 0.001359567791223526, "train/perplexity": 10.061583145222535, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790422.3720085444, "perf/iters_per_sec": 0.8537399158518526, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171316909790039, "data/tokens_consumed": 26761756672, "data/tokens_consumed_B": 26.761756672, "train/loss_slope": 3.4803417828432117e-06} {"step": 12770, "timestamp": 1778339517.1735895, "train/loss": 2.316119575500488, "train/z_loss": 0.0013580385595560074, "train/perplexity": 10.136264882295007, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917873.458889195, "perf/iters_per_sec": 0.9145133299299216, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0934777736663819, "data/tokens_consumed": 26782728192, "data/tokens_consumed_B": 26.782728192, "train/loss_slope": 1.1865940746373318e-06} {"step": 12780, "timestamp": 1778339527.5419688, "train/loss": 2.3428173303604125, "train/z_loss": 0.001368127786554396, "train/perplexity": 10.41052517415291, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024579.512265632, "perf/iters_per_sec": 0.9653947411850128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358457088470459, "data/tokens_consumed": 26803699712, "data/tokens_consumed_B": 26.803699712, "train/loss_slope": 1.2940308370284804e-06} {"step": 12790, "timestamp": 1778339537.8879175, "train/loss": 2.279861402511597, "train/z_loss": 0.0013812778866849839, "train/perplexity": 9.775325480076303, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028438.1957163878, "perf/iters_per_sec": 0.9672347048360767, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338752269744873, "data/tokens_consumed": 26824671232, "data/tokens_consumed_B": 26.824671232, "train/loss_slope": -5.028639217414392e-06} {"step": 12800, "timestamp": 1778339548.2314591, "grad/layer_0/attn": 0.003209728980436921, "grad/layer_0/mlp": 0.0034882910549640656, "grad/layer_0/attn_mlp_ratio": 0.9201436571225621, "grad/layer_4/attn": 0.0018963960465043783, "grad/layer_4/mlp": 0.002664903411641717, "grad/layer_4/attn_mlp_ratio": 0.7116190279385017, "grad/layer_8/attn": 0.005840189754962921, "grad/layer_8/mlp": 0.0034985810052603483, "grad/layer_8/attn_mlp_ratio": 1.6693023769498467, "grad/layer_12/attn": 0.007642768323421478, "grad/layer_12/mlp": 0.006667185574769974, "grad/layer_12/attn_mlp_ratio": 1.146326005640328, "grad/layer_16/attn": 0.0036145467311143875, "grad/layer_16/mlp": 0.0050459979102015495, "grad/layer_16/attn_mlp_ratio": 0.7163194919631001, "grad/layer_20/attn": 0.003542503109201789, "grad/layer_20/mlp": 0.006252989172935486, "grad/layer_20/attn_mlp_ratio": 0.5665295356469962, "grad/layer_24/attn": 0.012533565051853657, "grad/layer_24/mlp": 0.010832003317773342, "grad/layer_24/attn_mlp_ratio": 1.1570865119270886, "grad/layer_27/attn": 0.00906241126358509, "grad/layer_27/mlp": 0.01127593219280243, "grad/layer_27/attn_mlp_ratio": 0.8036950762261799} {"step": 12800, "timestamp": 1778339548.247143, "train/loss": 2.3311187028884888, "train/z_loss": 0.0013686895836144686, "train/perplexity": 10.289445929725272, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025454.280276157, "perf/iters_per_sec": 0.9658118630772385, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353983402252198, "data/tokens_consumed": 26845642752, "data/tokens_consumed_B": 26.845642752, "train/loss_slope": -5.557690330571672e-06} {"step": 12810, "timestamp": 1778339558.6034775, "train/loss": 2.3312155961990357, "train/z_loss": 0.0013558803591877222, "train/perplexity": 10.29044295650693, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026428.444093778, "perf/iters_per_sec": 0.9662763805836573, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349005937576294, "data/tokens_consumed": 26866614272, "data/tokens_consumed_B": 26.866614272, "train/loss_slope": -5.087124818515102e-06} {"step": 12820, "timestamp": 1778339568.948575, "train/loss": 2.34613835811615, "train/z_loss": 0.001349346071947366, "train/perplexity": 10.445156290828395, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028312.887476474, "perf/iters_per_sec": 0.9671749532110567, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339390993118287, "data/tokens_consumed": 26887585792, "data/tokens_consumed_B": 26.887585792, "train/loss_slope": -3.909587502443719e-06} {"step": 12825, "timestamp": 1778339574.7104514, "eos/sharpness": 43.60105991363525, "eos/L0_probe": 2.309386968612671, "eos/L_plus": 2.5352606773376465, "eos/L_minus": 2.519523859024048, "eos/grad_norm": 0.16955311596393585, "eos/embed_grad_frac": 0.08638427406549454, "eos/time_s": 0.5998685359954834} {"step": 12825, "timestamp": 1778339576.0895958, "geo/rankme_last": 429.690673828125, "geo/layer_0/stable_rank_q_proj": 20.60352897644043, "geo/layer_0/stable_rank_k_proj": 16.93429946899414, "geo/layer_0/stable_rank_o_proj": 43.84419631958008, "geo/layer_0/stable_rank_gate_proj": 125.09974670410156, "geo/layer_0/stable_rank_down_proj": 57.67108154296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06599432975053787, "geo/layer_0/attn_entropy_mean": 6.231846809387207, "geo/layer_0/attn_entropy_std": 0.45985010266304016, "geo/layer_7/stable_rank_q_proj": 41.88517761230469, "geo/layer_7/stable_rank_k_proj": 38.74050521850586, "geo/layer_7/stable_rank_o_proj": 88.50818634033203, "geo/layer_7/stable_rank_gate_proj": 78.27311706542969, "geo/layer_7/stable_rank_down_proj": 144.60980224609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.402660995721817, "geo/layer_7/attn_entropy_mean": 4.738250255584717, "geo/layer_7/attn_entropy_std": 0.7575319409370422, "geo/layer_14/stable_rank_q_proj": 51.826629638671875, "geo/layer_14/stable_rank_k_proj": 43.48976516723633, "geo/layer_14/stable_rank_o_proj": 42.26788330078125, "geo/layer_14/stable_rank_gate_proj": 71.83129119873047, "geo/layer_14/stable_rank_down_proj": 127.58088684082031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3849918246269226, "geo/layer_14/attn_entropy_mean": 5.521223068237305, "geo/layer_14/attn_entropy_std": 0.47841954231262207, "geo/layer_21/stable_rank_q_proj": 38.323768615722656, "geo/layer_21/stable_rank_k_proj": 28.591325759887695, "geo/layer_21/stable_rank_o_proj": 65.2686538696289, "geo/layer_21/stable_rank_gate_proj": 59.96665954589844, "geo/layer_21/stable_rank_down_proj": 48.777976989746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13732512295246124, "geo/layer_21/attn_entropy_mean": 5.85154914855957, "geo/layer_21/attn_entropy_std": 0.3381873667240143, "geo/layer_27/stable_rank_q_proj": 44.69927978515625, "geo/layer_27/stable_rank_k_proj": 30.428857803344727, "geo/layer_27/stable_rank_o_proj": 106.99526977539062, "geo/layer_27/stable_rank_gate_proj": 69.98526000976562, "geo/layer_27/stable_rank_down_proj": 130.392578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10623207688331604, "geo/layer_27/attn_entropy_mean": 4.328618049621582, "geo/layer_27/attn_entropy_std": 0.6858609318733215, "attnres/final_alpha/block_0": 0.260027676820755, "attnres/block_norm/0": 1.7818506956100464, "attnres/final_alpha/block_1": 0.0037968901451677084, "attnres/block_norm/1": 50797.4140625, "attnres/final_alpha/block_2": 0.008114604279398918, "attnres/block_norm/2": 29969.361328125, "attnres/final_alpha/block_3": 0.0103802140802145, "attnres/block_norm/3": 72858.890625, "attnres/final_alpha/block_4": 0.011780318804085255, "attnres/block_norm/4": 17558.044921875, "attnres/final_alpha/block_5": 0.6077721118927002, "attnres/block_norm/5": 7245.45947265625, "attnres/final_alpha/block_6": 0.09812816977500916, "attnres/block_norm/6": 49217.921875, "geo/tier1_time_s": 1.3600120544433594, "geo/step": 12825.0, "geo/rankme_slope": 0.00015128459586959784} {"step": 12830, "timestamp": 1778339581.2746112, "train/loss": 2.3439106941223145, "train/z_loss": 0.0013610155205242337, "train/perplexity": 10.421913889990746, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702092.4335182807, "perf/iters_per_sec": 0.8116209189978985, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2321022987365722, "data/tokens_consumed": 26908557312, "data/tokens_consumed_B": 26.908557312, "train/loss_slope": -1.2265197657765529e-06} {"step": 12840, "timestamp": 1778339591.6335824, "train/loss": 2.3613622903823854, "train/z_loss": 0.0013557536411099135, "train/perplexity": 10.605389235731561, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025846.642176967, "perf/iters_per_sec": 0.965998955811008, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351978063583374, "data/tokens_consumed": 26929528832, "data/tokens_consumed_B": 26.929528832, "train/loss_slope": 2.5536438455247726e-06} {"step": 12850, "timestamp": 1778339601.972155, "grad/layer_0/attn": 0.0029566616285592318, "grad/layer_0/mlp": 0.003191123716533184, "grad/layer_0/attn_mlp_ratio": 0.9265267656619258, "grad/layer_4/attn": 0.00174546183552593, "grad/layer_4/mlp": 0.0025773728266358376, "grad/layer_4/attn_mlp_ratio": 0.6772251766469144, "grad/layer_8/attn": 0.004026668146252632, "grad/layer_8/mlp": 0.0036605969071388245, "grad/layer_8/attn_mlp_ratio": 1.1000031247361846, "grad/layer_12/attn": 0.004710930399596691, "grad/layer_12/mlp": 0.00699307955801487, "grad/layer_12/attn_mlp_ratio": 0.6736560471175852, "grad/layer_16/attn": 0.0031811397057026625, "grad/layer_16/mlp": 0.004398892167955637, "grad/layer_16/attn_mlp_ratio": 0.7231683596518451, "grad/layer_20/attn": 0.00243117893114686, "grad/layer_20/mlp": 0.0054968660697340965, "grad/layer_20/attn_mlp_ratio": 0.4422845410595953, "grad/layer_24/attn": 0.0050203693099319935, "grad/layer_24/mlp": 0.007583938539028168, "grad/layer_24/attn_mlp_ratio": 0.6619738830818549, "grad/layer_27/attn": 0.007757165003567934, "grad/layer_27/mlp": 0.006776061374694109, "grad/layer_27/attn_mlp_ratio": 1.1447896440340537} {"step": 12850, "timestamp": 1778339601.9880614, "train/loss": 2.3120537757873536, "train/z_loss": 0.0013564498745836317, "train/perplexity": 10.095136525930858, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026754.0274301067, "perf/iters_per_sec": 0.9664316308165105, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034734344482422, "data/tokens_consumed": 26950500352, "data/tokens_consumed_B": 26.950500352, "train/loss_slope": -7.140256462245184e-07} {"step": 12860, "timestamp": 1778339612.3349082, "train/loss": 2.3286417007446287, "train/z_loss": 0.0013590804766863585, "train/perplexity": 10.263990489703202, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027939.580352821, "perf/iters_per_sec": 0.966996946503077, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341294288635254, "data/tokens_consumed": 26971471872, "data/tokens_consumed_B": 26.971471872, "train/loss_slope": -3.1790587982900415e-06} {"step": 12870, "timestamp": 1778339622.6924093, "train/loss": 2.329746055603027, "train/z_loss": 0.0013551287003792823, "train/perplexity": 10.275331838751363, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026212.317901795, "perf/iters_per_sec": 0.9661733235844588, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350109815597535, "data/tokens_consumed": 26992443392, "data/tokens_consumed_B": 26.992443392, "train/loss_slope": -4.248394524053176e-06} {"step": 12880, "timestamp": 1778339633.0457265, "train/loss": 2.3395434856414794, "train/z_loss": 0.0013657678966410459, "train/perplexity": 10.376498460769835, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026985.0295068843, "perf/iters_per_sec": 0.9665417811903402, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346164226531982, "data/tokens_consumed": 27013414912, "data/tokens_consumed_B": 27.013414912, "train/loss_slope": -3.7517863925140258e-06} {"step": 12890, "timestamp": 1778339643.4087439, "train/loss": 2.354162502288818, "train/z_loss": 0.001367336744442582, "train/perplexity": 10.529306897596683, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024661.9964760758, "perf/iters_per_sec": 0.9654340727215175, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358035087585449, "data/tokens_consumed": 27034386432, "data/tokens_consumed_B": 27.034386432, "train/loss_slope": -1.529323707784254e-06} {"step": 12900, "timestamp": 1778339653.7625742, "grad/layer_0/attn": 0.0030732741579413414, "grad/layer_0/mlp": 0.0033285978715866804, "grad/layer_0/attn_mlp_ratio": 0.9232938865477854, "grad/layer_4/attn": 0.0028868811205029488, "grad/layer_4/mlp": 0.0026414894964545965, "grad/layer_4/attn_mlp_ratio": 1.09289891748115, "grad/layer_8/attn": 0.004535581450909376, "grad/layer_8/mlp": 0.0035380914341658354, "grad/layer_8/attn_mlp_ratio": 1.2819288045860904, "grad/layer_12/attn": 0.007549058645963669, "grad/layer_12/mlp": 0.006737121846526861, "grad/layer_12/attn_mlp_ratio": 1.120516847680838, "grad/layer_16/attn": 0.004971531685441732, "grad/layer_16/mlp": 0.00481616985052824, "grad/layer_16/attn_mlp_ratio": 1.0322583580956175, "grad/layer_20/attn": 0.0038591730408370495, "grad/layer_20/mlp": 0.006299665663391352, "grad/layer_20/attn_mlp_ratio": 0.6125996498518215, "grad/layer_24/attn": 0.014887542463839054, "grad/layer_24/mlp": 0.014025571756064892, "grad/layer_24/attn_mlp_ratio": 1.0614570740230769, "grad/layer_27/attn": 0.008071309886872768, "grad/layer_27/mlp": 0.015323814004659653, "grad/layer_27/attn_mlp_ratio": 0.5267167711476252} {"step": 12900, "timestamp": 1778339654.3680382, "eos/sharpness": 70.21327018737792, "eos/L0_probe": 2.31428861618042, "eos/L_plus": 2.6296298503875732, "eos/L_minus": 2.701080083847046, "eos/grad_norm": 0.2761251926422119, "eos/embed_grad_frac": 0.030496591702103615, "eos/time_s": 0.6025912761688232} {"step": 12900, "timestamp": 1778339654.3893404, "train/loss": 2.28223922252655, "train/z_loss": 0.0013638763572089373, "train/perplexity": 9.79859710155431, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911157.7476535821, "perf/iters_per_sec": 0.9113110292690192, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0973201990127563, "data/tokens_consumed": 27055357952, "data/tokens_consumed_B": 27.055357952, "train/loss_slope": -3.502882274463993e-06} {"step": 12900, "timestamp": 1778339655.7496657, "geo/rankme_last": 428.7851867675781, "geo/layer_0/stable_rank_q_proj": 20.585308074951172, "geo/layer_0/stable_rank_k_proj": 16.956600189208984, "geo/layer_0/stable_rank_o_proj": 43.8266487121582, "geo/layer_0/stable_rank_gate_proj": 124.94048309326172, "geo/layer_0/stable_rank_down_proj": 57.68608093261719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06914138048887253, "geo/layer_0/attn_entropy_mean": 6.233861923217773, "geo/layer_0/attn_entropy_std": 0.45832765102386475, "geo/layer_7/stable_rank_q_proj": 41.841331481933594, "geo/layer_7/stable_rank_k_proj": 38.90098190307617, "geo/layer_7/stable_rank_o_proj": 88.33316040039062, "geo/layer_7/stable_rank_gate_proj": 78.29195404052734, "geo/layer_7/stable_rank_down_proj": 144.47433471679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3980521559715271, "geo/layer_7/attn_entropy_mean": 4.725202560424805, "geo/layer_7/attn_entropy_std": 0.7796475887298584, "geo/layer_14/stable_rank_q_proj": 51.78382110595703, "geo/layer_14/stable_rank_k_proj": 43.43913269042969, "geo/layer_14/stable_rank_o_proj": 42.32168960571289, "geo/layer_14/stable_rank_gate_proj": 71.93656921386719, "geo/layer_14/stable_rank_down_proj": 127.57415771484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.381542444229126, "geo/layer_14/attn_entropy_mean": 5.516902446746826, "geo/layer_14/attn_entropy_std": 0.46466749906539917, "geo/layer_21/stable_rank_q_proj": 38.365421295166016, "geo/layer_21/stable_rank_k_proj": 28.603763580322266, "geo/layer_21/stable_rank_o_proj": 65.22512817382812, "geo/layer_21/stable_rank_gate_proj": 59.954673767089844, "geo/layer_21/stable_rank_down_proj": 48.776737213134766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13577282428741455, "geo/layer_21/attn_entropy_mean": 5.860418319702148, "geo/layer_21/attn_entropy_std": 0.3332555890083313, "geo/layer_27/stable_rank_q_proj": 44.70629119873047, "geo/layer_27/stable_rank_k_proj": 30.426809310913086, "geo/layer_27/stable_rank_o_proj": 106.72608947753906, "geo/layer_27/stable_rank_gate_proj": 69.95207214355469, "geo/layer_27/stable_rank_down_proj": 130.55648803710938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11149758845567703, "geo/layer_27/attn_entropy_mean": 4.347964286804199, "geo/layer_27/attn_entropy_std": 0.6785686016082764, "attnres/final_alpha/block_0": 0.26339155435562134, "attnres/block_norm/0": 1.781968355178833, "attnres/final_alpha/block_1": 0.003839417127892375, "attnres/block_norm/1": 50640.73828125, "attnres/final_alpha/block_2": 0.008175954222679138, "attnres/block_norm/2": 29986.720703125, "attnres/final_alpha/block_3": 0.010432610288262367, "attnres/block_norm/3": 72905.546875, "attnres/final_alpha/block_4": 0.011751687154173851, "attnres/block_norm/4": 17598.794921875, "attnres/final_alpha/block_5": 0.6023393869400024, "attnres/block_norm/5": 7233.3916015625, "attnres/final_alpha/block_6": 0.10006938874721527, "attnres/block_norm/6": 48994.5234375, "geo/tier1_time_s": 1.3560280799865723, "geo/step": 12900.0, "geo/rankme_slope": 0.0001059073824842437} {"step": 12910, "timestamp": 1778339666.1036897, "train/loss": 2.3153478384017943, "train/z_loss": 0.0013663288787938654, "train/perplexity": 10.128445368335754, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790867.8235045536, "perf/iters_per_sec": 0.853952323677327, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171025562286377, "data/tokens_consumed": 27076329472, "data/tokens_consumed_B": 27.076329472, "train/loss_slope": -2.432742191798827e-06} {"step": 12920, "timestamp": 1778339676.4515336, "train/loss": 2.3131715059280396, "train/z_loss": 0.0013769970973953605, "train/perplexity": 10.106426472681719, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027678.2583064197, "perf/iters_per_sec": 0.9668723384410952, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342627048492432, "data/tokens_consumed": 27097300992, "data/tokens_consumed_B": 27.097300992, "train/loss_slope": -4.610646268179071e-06} {"step": 12930, "timestamp": 1778339686.8160558, "train/loss": 2.3558878898620605, "train/z_loss": 0.0013734681182540954, "train/perplexity": 10.547489714565076, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024821.6714509747, "perf/iters_per_sec": 0.9655102116827844, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357218265533448, "data/tokens_consumed": 27118272512, "data/tokens_consumed_B": 27.118272512, "train/loss_slope": -6.890213586101649e-06} {"step": 12940, "timestamp": 1778339697.170812, "train/loss": 2.314907193183899, "train/z_loss": 0.0013671376393176616, "train/perplexity": 10.123983300486113, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026625.8453966714, "perf/iters_per_sec": 0.9663705088599546, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347997903823853, "data/tokens_consumed": 27139244032, "data/tokens_consumed_B": 27.139244032, "train/loss_slope": -1.066713891085152e-05} {"step": 12950, "timestamp": 1778339707.5211518, "grad/layer_0/attn": 0.002522626193240285, "grad/layer_0/mlp": 0.0031709247268736362, "grad/layer_0/attn_mlp_ratio": 0.7955490372590953, "grad/layer_4/attn": 0.001793081290088594, "grad/layer_4/mlp": 0.002512805163860321, "grad/layer_4/attn_mlp_ratio": 0.7135774967830001, "grad/layer_8/attn": 0.0025227435398846865, "grad/layer_8/mlp": 0.0032484354451298714, "grad/layer_8/attn_mlp_ratio": 0.7766026152702463, "grad/layer_12/attn": 0.007609670516103506, "grad/layer_12/mlp": 0.006131419911980629, "grad/layer_12/attn_mlp_ratio": 1.2410943144058662, "grad/layer_16/attn": 0.003293388755992055, "grad/layer_16/mlp": 0.00463786581531167, "grad/layer_16/attn_mlp_ratio": 0.7101086612096968, "grad/layer_20/attn": 0.003860416356474161, "grad/layer_20/mlp": 0.0053347875364124775, "grad/layer_20/attn_mlp_ratio": 0.7236307458847984, "grad/layer_24/attn": 0.006494551431387663, "grad/layer_24/mlp": 0.008155634626746178, "grad/layer_24/attn_mlp_ratio": 0.7963269137212536, "grad/layer_27/attn": 0.005419488064944744, "grad/layer_27/mlp": 0.009775262326002121, "grad/layer_27/attn_mlp_ratio": 0.5544084474426946} {"step": 12950, "timestamp": 1778339707.5372357, "train/loss": 2.355220890045166, "train/z_loss": 0.0013701444724574686, "train/perplexity": 10.540456886564995, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024546.054485031, "perf/iters_per_sec": 0.9653787872719912, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358628273010253, "data/tokens_consumed": 27160215552, "data/tokens_consumed_B": 27.160215552, "train/loss_slope": -9.277256465766487e-06} {"step": 12960, "timestamp": 1778339717.905728, "train/loss": 2.328508710861206, "train/z_loss": 0.001360536424908787, "train/perplexity": 10.262625573566556, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024303.2158298653, "perf/iters_per_sec": 0.9652629927777602, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035987091064453, "data/tokens_consumed": 27181187072, "data/tokens_consumed_B": 27.181187072, "train/loss_slope": -1.0004816432990612e-05} {"step": 12970, "timestamp": 1778339728.2609184, "train/loss": 2.3182055234909056, "train/z_loss": 0.0013688726699911057, "train/perplexity": 10.157430671349465, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026838.0900253153, "perf/iters_per_sec": 0.966471714985521, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346914291381837, "data/tokens_consumed": 27202158592, "data/tokens_consumed_B": 27.202158592, "train/loss_slope": -1.2292286514913984e-05} {"step": 12975, "timestamp": 1778339734.0362816, "eos/sharpness": 65.41440486907958, "eos/L0_probe": 2.3104166984558105, "eos/L_plus": 2.594935655593872, "eos/L_minus": 2.680041790008545, "eos/grad_norm": 0.2074507176876068, "eos/embed_grad_frac": 0.059883251786231995, "eos/time_s": 0.6086430549621582} {"step": 12975, "timestamp": 1778339735.416465, "geo/rankme_last": 429.6094970703125, "geo/layer_0/stable_rank_q_proj": 20.623004913330078, "geo/layer_0/stable_rank_k_proj": 16.95781898498535, "geo/layer_0/stable_rank_o_proj": 43.821876525878906, "geo/layer_0/stable_rank_gate_proj": 125.1052474975586, "geo/layer_0/stable_rank_down_proj": 57.683834075927734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06800613552331924, "geo/layer_0/attn_entropy_mean": 6.233590126037598, "geo/layer_0/attn_entropy_std": 0.45708325505256653, "geo/layer_7/stable_rank_q_proj": 41.80685043334961, "geo/layer_7/stable_rank_k_proj": 38.90849685668945, "geo/layer_7/stable_rank_o_proj": 88.35002136230469, "geo/layer_7/stable_rank_gate_proj": 78.26079559326172, "geo/layer_7/stable_rank_down_proj": 144.59579467773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40183892846107483, "geo/layer_7/attn_entropy_mean": 4.734737396240234, "geo/layer_7/attn_entropy_std": 0.7458856701850891, "geo/layer_14/stable_rank_q_proj": 51.84466552734375, "geo/layer_14/stable_rank_k_proj": 43.42570114135742, "geo/layer_14/stable_rank_o_proj": 42.2808723449707, "geo/layer_14/stable_rank_gate_proj": 71.79308319091797, "geo/layer_14/stable_rank_down_proj": 127.38919067382812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37494924664497375, "geo/layer_14/attn_entropy_mean": 5.545279026031494, "geo/layer_14/attn_entropy_std": 0.4711344242095947, "geo/layer_21/stable_rank_q_proj": 38.30750274658203, "geo/layer_21/stable_rank_k_proj": 28.626514434814453, "geo/layer_21/stable_rank_o_proj": 65.13455963134766, "geo/layer_21/stable_rank_gate_proj": 59.88165283203125, "geo/layer_21/stable_rank_down_proj": 48.73686981201172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1376378834247589, "geo/layer_21/attn_entropy_mean": 5.843373775482178, "geo/layer_21/attn_entropy_std": 0.332601934671402, "geo/layer_27/stable_rank_q_proj": 44.75471878051758, "geo/layer_27/stable_rank_k_proj": 30.430049896240234, "geo/layer_27/stable_rank_o_proj": 106.74622344970703, "geo/layer_27/stable_rank_gate_proj": 69.8398666381836, "geo/layer_27/stable_rank_down_proj": 130.29200744628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10297069698572159, "geo/layer_27/attn_entropy_mean": 4.317837715148926, "geo/layer_27/attn_entropy_std": 0.69074946641922, "attnres/final_alpha/block_0": 0.26176586747169495, "attnres/block_norm/0": 1.782257080078125, "attnres/final_alpha/block_1": 0.003782034618780017, "attnres/block_norm/1": 50695.203125, "attnres/final_alpha/block_2": 0.00824250839650631, "attnres/block_norm/2": 30134.57421875, "attnres/final_alpha/block_3": 0.010548979043960571, "attnres/block_norm/3": 73077.1796875, "attnres/final_alpha/block_4": 0.011879677884280682, "attnres/block_norm/4": 17574.765625, "attnres/final_alpha/block_5": 0.6050859093666077, "attnres/block_norm/5": 7216.4091796875, "attnres/final_alpha/block_6": 0.09869503974914551, "attnres/block_norm/6": 48889.18359375, "geo/tier1_time_s": 1.3604552745819092, "geo/step": 12975.0, "geo/rankme_slope": 0.00012357493778761505} {"step": 12980, "timestamp": 1778339740.6007535, "train/loss": 2.293332266807556, "train/z_loss": 0.0013630294590257109, "train/perplexity": 9.90789849484778, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700409.3025571269, "perf/iters_per_sec": 0.8108183396134981, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2333218812942506, "data/tokens_consumed": 27223130112, "data/tokens_consumed_B": 27.223130112, "train/loss_slope": -1.4015584416908796e-05} {"step": 12990, "timestamp": 1778339750.9696038, "train/loss": 2.3308247327804565, "train/z_loss": 0.001355572952888906, "train/perplexity": 10.286421584749009, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023882.346541135, "perf/iters_per_sec": 0.9650623066621471, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362025260925294, "data/tokens_consumed": 27244101632, "data/tokens_consumed_B": 27.244101632, "train/loss_slope": -1.4128464691065967e-05} {"step": 13000, "timestamp": 1778339761.301937, "grad/layer_0/attn": 0.004287668503820896, "grad/layer_0/mlp": 0.0038785580545663834, "grad/layer_0/attn_mlp_ratio": 1.1054800090525523, "grad/layer_4/attn": 0.0016557961935177445, "grad/layer_4/mlp": 0.0026836905162781477, "grad/layer_4/attn_mlp_ratio": 0.6169847535607776, "grad/layer_8/attn": 0.005004142411053181, "grad/layer_8/mlp": 0.003664059331640601, "grad/layer_8/attn_mlp_ratio": 1.3657372388232663, "grad/layer_12/attn": 0.006089234258979559, "grad/layer_12/mlp": 0.007582284510135651, "grad/layer_12/attn_mlp_ratio": 0.8030870076862784, "grad/layer_16/attn": 0.004085463937371969, "grad/layer_16/mlp": 0.005511673167347908, "grad/layer_16/attn_mlp_ratio": 0.7412384114956441, "grad/layer_20/attn": 0.0032333109993487597, "grad/layer_20/mlp": 0.007115502376109362, "grad/layer_20/attn_mlp_ratio": 0.45440374874504724, "grad/layer_24/attn": 0.012679426930844784, "grad/layer_24/mlp": 0.012036889791488647, "grad/layer_24/attn_mlp_ratio": 1.0533806527390832, "grad/layer_27/attn": 0.006147806067019701, "grad/layer_27/mlp": 0.012579317204654217, "grad/layer_27/attn_mlp_ratio": 0.4887233478676205} {"step": 13000, "timestamp": 1778339761.3176024, "train/loss": 2.3336352109909058, "train/z_loss": 0.0013540187734179198, "train/perplexity": 10.315372011692391, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027539.8176413646, "perf/iters_per_sec": 0.9668063247877906, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343333244323731, "data/tokens_consumed": 27265073152, "data/tokens_consumed_B": 27.265073152, "train/loss_slope": -1.2874892063409829e-05} {"step": 13000, "timestamp": 1778339768.3578875, "geo/ww_alpha_mean": 7.419415717320069, "geo/ww_alpha_std": 4.373090281715853, "geo/ww_alpha_min": 1.3511324873267658, "geo/ww_alpha_max": 32.731815623055084, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.846502351927199, "geo/ww_alpha_by_type/k_proj": 4.44474153802445, "geo/ww_alpha_by_type/v_proj": 8.3218303251526, "geo/ww_alpha_by_type/o_proj": 7.800390337866484, "geo/ww_alpha_by_type/gate_proj": 7.775266206121197, "geo/ww_alpha_by_type/up_proj": 11.934787678574756, "geo/ww_alpha_by_type/down_proj": 7.9014245274514625, "geo/twonn_id/layer_0": 0.7858629822731018, "geo/twonn_id/layer_7": 3.465700149536133, "geo/twonn_id/layer_14": 4.883476257324219, "geo/twonn_id/layer_21": 8.25164794921875, "geo/twonn_id/layer_27": 6.122923374176025, "geo/tier2_time_s": 7.031819105148315} {"step": 13000, "timestamp": 1778339769.1124072, "eoc/jacobian_sigma/layer_0/attn": 1427.98681640625, "eoc/jacobian_sigma/layer_0/mlp": 10882.939453125, "eoc/jacobian_sigma/layer_0": 10882.939453125, "eoc/jacobian_sigma/layer_7/attn": 1.1647183895111084, "eoc/jacobian_sigma/layer_7/mlp": 1.8288551568984985, "eoc/jacobian_sigma/layer_7": 1.8288551568984985, "eoc/jacobian_sigma/layer_14/attn": 2.0998618602752686, "eoc/jacobian_sigma/layer_14/mlp": 13.615253448486328, "eoc/jacobian_sigma/layer_14": 13.615253448486328, "eoc/jacobian_sigma/layer_21/attn": 1.0937625169754028, "eoc/jacobian_sigma/layer_21/mlp": 5.222695350646973, "eoc/jacobian_sigma/layer_21": 5.222695350646973, "eoc/jacobian_sigma/layer_27/attn": 4.081902503967285, "eoc/jacobian_sigma/layer_27/mlp": 37.57072830200195, "eoc/jacobian_sigma/layer_27": 37.57072830200195, "eoc/layer0_sigma": 10882.939453125, "eoc/sigma_max": 37.57072830200195, "eoc/sigma_min": 1.8288551568984985, "eoc/sigma_mean": 14.559383064508438, "eoc/time_s": 0.7483892440795898} {"step": 13010, "timestamp": 1778339779.47744, "train/loss": 2.3461914539337156, "train/z_loss": 0.0013651405228301884, "train/perplexity": 10.445710899664833, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1155293.6987562035, "perf/iters_per_sec": 0.5508869642048853, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8152544260025025, "data/tokens_consumed": 27286044672, "data/tokens_consumed_B": 27.286044672, "train/loss_slope": -8.728783330222075e-06} {"step": 13020, "timestamp": 1778339789.8391182, "train/loss": 2.314772891998291, "train/z_loss": 0.0013627850683405995, "train/perplexity": 10.122623728823868, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024953.2614393514, "perf/iters_per_sec": 0.9655729586788899, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356545209884644, "data/tokens_consumed": 27307016192, "data/tokens_consumed_B": 27.307016192, "train/loss_slope": -9.785820849121373e-06} {"step": 13030, "timestamp": 1778339800.1963727, "train/loss": 2.3316683530807496, "train/z_loss": 0.0013646342791616916, "train/perplexity": 10.295103080243312, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026781.8140164397, "perf/iters_per_sec": 0.9664448804933737, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347201585769654, "data/tokens_consumed": 27327987712, "data/tokens_consumed_B": 27.327987712, "train/loss_slope": -8.89261625137124e-06} {"step": 13040, "timestamp": 1778339810.544248, "train/loss": 2.321653366088867, "train/z_loss": 0.001368693565018475, "train/perplexity": 10.19251233678111, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027951.783260634, "perf/iters_per_sec": 0.9670027653029604, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341232061386108, "data/tokens_consumed": 27348959232, "data/tokens_consumed_B": 27.348959232, "train/loss_slope": -1.2217481294409344e-05} {"step": 13050, "timestamp": 1778339820.8981044, "grad/layer_0/attn": 0.002886370988562703, "grad/layer_0/mlp": 0.0032306346110999584, "grad/layer_0/attn_mlp_ratio": 0.8934377441824609, "grad/layer_4/attn": 0.002633832162246108, "grad/layer_4/mlp": 0.002647919114679098, "grad/layer_4/attn_mlp_ratio": 0.9946799538464404, "grad/layer_8/attn": 0.005074793938547373, "grad/layer_8/mlp": 0.0035656699910759926, "grad/layer_8/attn_mlp_ratio": 1.423237093989248, "grad/layer_12/attn": 0.011898772791028023, "grad/layer_12/mlp": 0.007117360830307007, "grad/layer_12/attn_mlp_ratio": 1.671795614630263, "grad/layer_16/attn": 0.0047035315074026585, "grad/layer_16/mlp": 0.004586371127516031, "grad/layer_16/attn_mlp_ratio": 1.0255453111130042, "grad/layer_20/attn": 0.004122756887227297, "grad/layer_20/mlp": 0.007120971102267504, "grad/layer_20/attn_mlp_ratio": 0.5789599157365646, "grad/layer_24/attn": 0.008959326893091202, "grad/layer_24/mlp": 0.011333750560879707, "grad/layer_24/attn_mlp_ratio": 0.7904997349214486, "grad/layer_27/attn": 0.011261873878538609, "grad/layer_27/mlp": 0.012183409184217453, "grad/layer_27/attn_mlp_ratio": 0.9243614505446671} {"step": 13050, "timestamp": 1778339821.5137556, "eos/sharpness": 31.370592117309563, "eos/L0_probe": 2.308173418045044, "eos/L_plus": 2.4836983680725098, "eos/L_minus": 2.446354389190674, "eos/grad_norm": 0.15612030029296875, "eos/embed_grad_frac": 0.10668399930000305, "eos/time_s": 0.6129541397094727} {"step": 13050, "timestamp": 1778339821.5351646, "train/loss": 2.304743266105652, "train/z_loss": 0.0013787442701868712, "train/perplexity": 10.021605036434583, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908960.3564188352, "perf/iters_per_sec": 0.910263231477182, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0985833168029786, "data/tokens_consumed": 27369930752, "data/tokens_consumed_B": 27.369930752, "train/loss_slope": -1.420336895578442e-05} {"step": 13050, "timestamp": 1778339822.90099, "geo/rankme_last": 430.8243713378906, "geo/layer_0/stable_rank_q_proj": 20.63685417175293, "geo/layer_0/stable_rank_k_proj": 16.92390251159668, "geo/layer_0/stable_rank_o_proj": 43.82817840576172, "geo/layer_0/stable_rank_gate_proj": 124.94056701660156, "geo/layer_0/stable_rank_down_proj": 57.70703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06569892913103104, "geo/layer_0/attn_entropy_mean": 6.236544609069824, "geo/layer_0/attn_entropy_std": 0.46231144666671753, "geo/layer_7/stable_rank_q_proj": 41.75349426269531, "geo/layer_7/stable_rank_k_proj": 38.77388381958008, "geo/layer_7/stable_rank_o_proj": 88.44822692871094, "geo/layer_7/stable_rank_gate_proj": 78.24800872802734, "geo/layer_7/stable_rank_down_proj": 144.34786987304688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39361461997032166, "geo/layer_7/attn_entropy_mean": 4.731156349182129, "geo/layer_7/attn_entropy_std": 0.7580399513244629, "geo/layer_14/stable_rank_q_proj": 51.888545989990234, "geo/layer_14/stable_rank_k_proj": 43.50832748413086, "geo/layer_14/stable_rank_o_proj": 42.320404052734375, "geo/layer_14/stable_rank_gate_proj": 71.78556823730469, "geo/layer_14/stable_rank_down_proj": 127.69523620605469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3723556399345398, "geo/layer_14/attn_entropy_mean": 5.5228729248046875, "geo/layer_14/attn_entropy_std": 0.47185900807380676, "geo/layer_21/stable_rank_q_proj": 38.32374572753906, "geo/layer_21/stable_rank_k_proj": 28.609647750854492, "geo/layer_21/stable_rank_o_proj": 65.15596771240234, "geo/layer_21/stable_rank_gate_proj": 59.84600830078125, "geo/layer_21/stable_rank_down_proj": 48.71015167236328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1363372802734375, "geo/layer_21/attn_entropy_mean": 5.839851379394531, "geo/layer_21/attn_entropy_std": 0.3167061507701874, "geo/layer_27/stable_rank_q_proj": 44.70856857299805, "geo/layer_27/stable_rank_k_proj": 30.445634841918945, "geo/layer_27/stable_rank_o_proj": 106.83783721923828, "geo/layer_27/stable_rank_gate_proj": 69.8538589477539, "geo/layer_27/stable_rank_down_proj": 130.6421661376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09647386521100998, "geo/layer_27/attn_entropy_mean": 4.2960524559021, "geo/layer_27/attn_entropy_std": 0.6701568961143494, "attnres/final_alpha/block_0": 0.2606567144393921, "attnres/block_norm/0": 1.7823691368103027, "attnres/final_alpha/block_1": 0.0037732429336756468, "attnres/block_norm/1": 50672.68359375, "attnres/final_alpha/block_2": 0.00818735919892788, "attnres/block_norm/2": 29982.15625, "attnres/final_alpha/block_3": 0.010494500398635864, "attnres/block_norm/3": 72462.09375, "attnres/final_alpha/block_4": 0.011721535585820675, "attnres/block_norm/4": 17623.064453125, "attnres/final_alpha/block_5": 0.6069629192352295, "attnres/block_norm/5": 7181.509765625, "attnres/final_alpha/block_6": 0.09820373356342316, "attnres/block_norm/6": 48832.328125, "geo/tier1_time_s": 1.3615748882293701, "geo/step": 13050.0, "geo/rankme_slope": 0.00015792797587785114} {"step": 13060, "timestamp": 1778339833.258265, "train/loss": 2.356304335594177, "train/z_loss": 0.0013600701582618057, "train/perplexity": 10.551883086378226, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789465.6197077078, "perf/iters_per_sec": 0.8532837008036174, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1719431638717652, "data/tokens_consumed": 27390902272, "data/tokens_consumed_B": 27.390902272, "train/loss_slope": -1.2933156656997178e-05} {"step": 13070, "timestamp": 1778339843.613462, "train/loss": 2.315979528427124, "train/z_loss": 0.0013522379449568689, "train/perplexity": 10.134845427460965, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026949.9507907568, "perf/iters_per_sec": 0.9665250543550286, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346343278884889, "data/tokens_consumed": 27411873792, "data/tokens_consumed_B": 27.411873792, "train/loss_slope": -1.4937232587203738e-05} {"step": 13080, "timestamp": 1778339853.9858208, "train/loss": 2.31836473941803, "train/z_loss": 0.001364777737762779, "train/perplexity": 10.159048024841804, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022788.1886075041, "perf/iters_per_sec": 0.9645405715024491, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367630243301391, "data/tokens_consumed": 27432845312, "data/tokens_consumed_B": 27.432845312, "train/loss_slope": -1.671094980248454e-05} {"step": 13090, "timestamp": 1778339864.345218, "train/loss": 2.333406138420105, "train/z_loss": 0.0013569816132076085, "train/perplexity": 10.313009313531905, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025910.0053141573, "perf/iters_per_sec": 0.9660291697092807, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351654291152954, "data/tokens_consumed": 27453816832, "data/tokens_consumed_B": 27.453816832, "train/loss_slope": -1.67071094106634e-05} {"step": 13100, "timestamp": 1778339874.6867816, "grad/layer_0/attn": 0.002985656028613448, "grad/layer_0/mlp": 0.0033093257807195187, "grad/layer_0/attn_mlp_ratio": 0.9021946270109555, "grad/layer_4/attn": 0.002075671451166272, "grad/layer_4/mlp": 0.0026198907289654016, "grad/layer_4/attn_mlp_ratio": 0.792274024634056, "grad/layer_8/attn": 0.005075044929981232, "grad/layer_8/mlp": 0.0033776697237044573, "grad/layer_8/attn_mlp_ratio": 1.5025284278423576, "grad/layer_12/attn": 0.008782688528299332, "grad/layer_12/mlp": 0.007308405824005604, "grad/layer_12/attn_mlp_ratio": 1.201724236396232, "grad/layer_16/attn": 0.004238657653331757, "grad/layer_16/mlp": 0.005371462553739548, "grad/layer_16/attn_mlp_ratio": 0.7891067901925846, "grad/layer_20/attn": 0.005754394456744194, "grad/layer_20/mlp": 0.006960402708500624, "grad/layer_20/attn_mlp_ratio": 0.8267329657583105, "grad/layer_24/attn": 0.012206530198454857, "grad/layer_24/mlp": 0.011665662750601768, "grad/layer_24/attn_mlp_ratio": 1.0463640476139071, "grad/layer_27/attn": 0.00958835706114769, "grad/layer_27/mlp": 0.012211565859615803, "grad/layer_27/attn_mlp_ratio": 0.7851865266794462} {"step": 13100, "timestamp": 1778339874.7026556, "train/loss": 2.30314679145813, "train/z_loss": 0.0013647885876707732, "train/perplexity": 10.005618562462073, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026296.288887269, "perf/iters_per_sec": 0.9662133640705438, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034968090057373, "data/tokens_consumed": 27474788352, "data/tokens_consumed_B": 27.474788352, "train/loss_slope": -1.4002199627921817e-05} {"step": 13110, "timestamp": 1778339885.0529892, "train/loss": 2.3394902467727663, "train/z_loss": 0.0013653222937136888, "train/perplexity": 10.375946042435775, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027162.7304975572, "perf/iters_per_sec": 0.9666265156257425, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034525728225708, "data/tokens_consumed": 27495759872, "data/tokens_consumed_B": 27.495759872, "train/loss_slope": -1.2880725471457697e-05} {"step": 13120, "timestamp": 1778339895.405069, "train/loss": 2.3087722778320314, "train/z_loss": 0.0013577714911662043, "train/perplexity": 10.062063650027664, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027156.0965148401, "perf/iters_per_sec": 0.9666233522962762, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345291137695312, "data/tokens_consumed": 27516731392, "data/tokens_consumed_B": 27.516731392, "train/loss_slope": -1.3954156746279195e-05} {"step": 13125, "timestamp": 1778339901.1744723, "eos/sharpness": 52.08075046539306, "eos/L0_probe": 2.308319330215454, "eos/L_plus": 2.5721333026885986, "eos/L_minus": 2.5653128623962402, "eos/grad_norm": 0.18063125014305115, "eos/embed_grad_frac": 0.0750880017876625, "eos/time_s": 0.6033575534820557} {"step": 13125, "timestamp": 1778339902.556517, "geo/rankme_last": 429.4170227050781, "geo/layer_0/stable_rank_q_proj": 20.65376853942871, "geo/layer_0/stable_rank_k_proj": 16.922924041748047, "geo/layer_0/stable_rank_o_proj": 43.83372116088867, "geo/layer_0/stable_rank_gate_proj": 124.86138916015625, "geo/layer_0/stable_rank_down_proj": 57.70627975463867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06466521322727203, "geo/layer_0/attn_entropy_mean": 6.233283519744873, "geo/layer_0/attn_entropy_std": 0.4601588249206543, "geo/layer_7/stable_rank_q_proj": 41.78913116455078, "geo/layer_7/stable_rank_k_proj": 38.85686111450195, "geo/layer_7/stable_rank_o_proj": 88.51958465576172, "geo/layer_7/stable_rank_gate_proj": 78.43897247314453, "geo/layer_7/stable_rank_down_proj": 144.3347625732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39219996333122253, "geo/layer_7/attn_entropy_mean": 4.756608963012695, "geo/layer_7/attn_entropy_std": 0.7527448534965515, "geo/layer_14/stable_rank_q_proj": 51.87411117553711, "geo/layer_14/stable_rank_k_proj": 43.63275909423828, "geo/layer_14/stable_rank_o_proj": 42.298213958740234, "geo/layer_14/stable_rank_gate_proj": 71.67085266113281, "geo/layer_14/stable_rank_down_proj": 127.5311050415039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38337332010269165, "geo/layer_14/attn_entropy_mean": 5.513816833496094, "geo/layer_14/attn_entropy_std": 0.5065745711326599, "geo/layer_21/stable_rank_q_proj": 38.25265884399414, "geo/layer_21/stable_rank_k_proj": 28.658222198486328, "geo/layer_21/stable_rank_o_proj": 65.1283187866211, "geo/layer_21/stable_rank_gate_proj": 59.75248336791992, "geo/layer_21/stable_rank_down_proj": 48.692527770996094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13707107305526733, "geo/layer_21/attn_entropy_mean": 5.841361999511719, "geo/layer_21/attn_entropy_std": 0.33578306436538696, "geo/layer_27/stable_rank_q_proj": 44.77641296386719, "geo/layer_27/stable_rank_k_proj": 30.41141128540039, "geo/layer_27/stable_rank_o_proj": 107.008544921875, "geo/layer_27/stable_rank_gate_proj": 69.85689544677734, "geo/layer_27/stable_rank_down_proj": 130.69664001464844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10554776340723038, "geo/layer_27/attn_entropy_mean": 4.322605609893799, "geo/layer_27/attn_entropy_std": 0.7000736594200134, "attnres/final_alpha/block_0": 0.2640935778617859, "attnres/block_norm/0": 1.7824842929840088, "attnres/final_alpha/block_1": 0.003777641337364912, "attnres/block_norm/1": 50803.6328125, "attnres/final_alpha/block_2": 0.00819610245525837, "attnres/block_norm/2": 30126.19921875, "attnres/final_alpha/block_3": 0.01046437956392765, "attnres/block_norm/3": 73061.515625, "attnres/final_alpha/block_4": 0.011849215254187584, "attnres/block_norm/4": 17583.033203125, "attnres/final_alpha/block_5": 0.6013549566268921, "attnres/block_norm/5": 7284.95458984375, "attnres/final_alpha/block_6": 0.10026412457227707, "attnres/block_norm/6": 48906.6015625, "geo/tier1_time_s": 1.3620986938476562, "geo/step": 13125.0, "geo/rankme_slope": 0.00012340905112044818} {"step": 13130, "timestamp": 1778339907.7320402, "train/loss": 2.3159525394439697, "train/z_loss": 0.0013657735893502832, "train/perplexity": 10.134571901979557, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702222.7399938384, "perf/iters_per_sec": 0.8116830539673988, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2320079803466797, "data/tokens_consumed": 27537702912, "data/tokens_consumed_B": 27.537702912, "train/loss_slope": -1.2820722189101248e-05} {"step": 13140, "timestamp": 1778339918.08101, "train/loss": 2.350356388092041, "train/z_loss": 0.0013512220117263495, "train/perplexity": 10.48930732289391, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027905.6841572144, "perf/iters_per_sec": 0.9669807835374901, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341467142105103, "data/tokens_consumed": 27558674432, "data/tokens_consumed_B": 27.558674432, "train/loss_slope": -8.760964387607008e-06} {"step": 13150, "timestamp": 1778339928.422095, "grad/layer_0/attn": 0.003024123143404722, "grad/layer_0/mlp": 0.0031153487507253885, "grad/layer_0/attn_mlp_ratio": 0.9707173380279949, "grad/layer_4/attn": 0.0021388002205640078, "grad/layer_4/mlp": 0.002601103624328971, "grad/layer_4/attn_mlp_ratio": 0.8222664096626029, "grad/layer_8/attn": 0.0042324429377913475, "grad/layer_8/mlp": 0.0035023787058889866, "grad/layer_8/attn_mlp_ratio": 1.2084480783959783, "grad/layer_12/attn": 0.005578471347689629, "grad/layer_12/mlp": 0.00629518274217844, "grad/layer_12/attn_mlp_ratio": 0.8861492171940165, "grad/layer_16/attn": 0.005774632561951876, "grad/layer_16/mlp": 0.004515441600233316, "grad/layer_16/attn_mlp_ratio": 1.278863275248022, "grad/layer_20/attn": 0.002897508442401886, "grad/layer_20/mlp": 0.005883452482521534, "grad/layer_20/attn_mlp_ratio": 0.49248437065843914, "grad/layer_24/attn": 0.013044090941548347, "grad/layer_24/mlp": 0.011498645879328251, "grad/layer_24/attn_mlp_ratio": 1.1344023431105217, "grad/layer_27/attn": 0.0070164380595088005, "grad/layer_27/mlp": 0.012020776979625225, "grad/layer_27/attn_mlp_ratio": 0.5836925527386583} {"step": 13150, "timestamp": 1778339928.4378664, "train/loss": 2.3656168460845945, "train/z_loss": 0.0013688843813724817, "train/perplexity": 10.650606576618856, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025852.054484526, "perf/iters_per_sec": 0.9660015366003637, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351950407028199, "data/tokens_consumed": 27579645952, "data/tokens_consumed_B": 27.579645952, "train/loss_slope": -7.304390393110412e-06} {"step": 13160, "timestamp": 1778339938.786015, "train/loss": 2.305759382247925, "train/z_loss": 0.0013734948355704547, "train/perplexity": 10.031793326449952, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027921.4866563785, "perf/iters_per_sec": 0.966988318756284, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341386556625367, "data/tokens_consumed": 27600617472, "data/tokens_consumed_B": 27.600617472, "train/loss_slope": -9.078780302155943e-06} {"step": 13170, "timestamp": 1778339949.1426008, "train/loss": 2.348618507385254, "train/z_loss": 0.0013679519528523088, "train/perplexity": 10.471093988954761, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026348.7567822991, "perf/iters_per_sec": 0.9662383827125068, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349412918090821, "data/tokens_consumed": 27621588992, "data/tokens_consumed_B": 27.621588992, "train/loss_slope": -8.265805165759857e-06} {"step": 13180, "timestamp": 1778339959.5195978, "train/loss": 2.35316162109375, "train/z_loss": 0.0013542157947085797, "train/perplexity": 10.518773584502597, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021950.4867371358, "perf/iters_per_sec": 0.9641411241231612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371925592422486, "data/tokens_consumed": 27642560512, "data/tokens_consumed_B": 27.642560512, "train/loss_slope": -9.34243830266822e-06} {"step": 13190, "timestamp": 1778339969.8757162, "train/loss": 2.2876301050186156, "train/z_loss": 0.0013604352949187159, "train/perplexity": 9.85156282483866, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026650.499916352, "perf/iters_per_sec": 0.966382265051056, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347872018814086, "data/tokens_consumed": 27663532032, "data/tokens_consumed_B": 27.663532032, "train/loss_slope": -1.3537971557814119e-05} {"step": 13200, "timestamp": 1778339980.2122943, "grad/layer_0/attn": 0.003031078726053238, "grad/layer_0/mlp": 0.0032464414834976196, "grad/layer_0/attn_mlp_ratio": 0.9336618719587875, "grad/layer_4/attn": 0.0028274471405893564, "grad/layer_4/mlp": 0.002513184677809477, "grad/layer_4/attn_mlp_ratio": 1.1250454664355378, "grad/layer_8/attn": 0.0038807301316410303, "grad/layer_8/mlp": 0.0033008623868227005, "grad/layer_8/attn_mlp_ratio": 1.1756715546719096, "grad/layer_12/attn": 0.006389460992068052, "grad/layer_12/mlp": 0.006667471956461668, "grad/layer_12/attn_mlp_ratio": 0.9583033776460772, "grad/layer_16/attn": 0.0031546978279948235, "grad/layer_16/mlp": 0.004933038726449013, "grad/layer_16/attn_mlp_ratio": 0.639503952630694, "grad/layer_20/attn": 0.002539001638069749, "grad/layer_20/mlp": 0.0054631950333714485, "grad/layer_20/attn_mlp_ratio": 0.4647466502817152, "grad/layer_24/attn": 0.00452580489218235, "grad/layer_24/mlp": 0.007044088561087847, "grad/layer_24/attn_mlp_ratio": 0.6424968665120993, "grad/layer_27/attn": 0.005123680457472801, "grad/layer_27/mlp": 0.007481560576707125, "grad/layer_27/attn_mlp_ratio": 0.684841128592958} {"step": 13200, "timestamp": 1778339980.8083735, "eos/sharpness": 36.993932723999016, "eos/L0_probe": 2.3064920902252197, "eos/L_plus": 2.531954765319824, "eos/L_minus": 2.4509687423706055, "eos/grad_norm": 0.11186987161636353, "eos/embed_grad_frac": 0.18112213909626007, "eos/time_s": 0.5933237075805664} {"step": 13200, "timestamp": 1778339980.8272936, "train/loss": 2.3470012187957763, "train/z_loss": 0.0013624320505186915, "train/perplexity": 10.454172894961461, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916127.9876762643, "perf/iters_per_sec": 0.9136810243970224, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0944738626480102, "data/tokens_consumed": 27684503552, "data/tokens_consumed_B": 27.684503552, "train/loss_slope": -1.2014282311257332e-05} {"step": 13200, "timestamp": 1778339982.189261, "geo/rankme_last": 429.45751953125, "geo/layer_0/stable_rank_q_proj": 20.659151077270508, "geo/layer_0/stable_rank_k_proj": 16.89690589904785, "geo/layer_0/stable_rank_o_proj": 43.84016418457031, "geo/layer_0/stable_rank_gate_proj": 124.79267883300781, "geo/layer_0/stable_rank_down_proj": 57.80141067504883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06278097629547119, "geo/layer_0/attn_entropy_mean": 6.231714725494385, "geo/layer_0/attn_entropy_std": 0.45782414078712463, "geo/layer_7/stable_rank_q_proj": 41.7152099609375, "geo/layer_7/stable_rank_k_proj": 38.781585693359375, "geo/layer_7/stable_rank_o_proj": 88.53209686279297, "geo/layer_7/stable_rank_gate_proj": 78.37039947509766, "geo/layer_7/stable_rank_down_proj": 144.30430603027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3943294584751129, "geo/layer_7/attn_entropy_mean": 4.725943088531494, "geo/layer_7/attn_entropy_std": 0.7737686634063721, "geo/layer_14/stable_rank_q_proj": 51.84770584106445, "geo/layer_14/stable_rank_k_proj": 43.64496994018555, "geo/layer_14/stable_rank_o_proj": 42.305233001708984, "geo/layer_14/stable_rank_gate_proj": 71.57400512695312, "geo/layer_14/stable_rank_down_proj": 127.58065032958984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37750351428985596, "geo/layer_14/attn_entropy_mean": 5.539231777191162, "geo/layer_14/attn_entropy_std": 0.498084157705307, "geo/layer_21/stable_rank_q_proj": 38.356689453125, "geo/layer_21/stable_rank_k_proj": 28.69150733947754, "geo/layer_21/stable_rank_o_proj": 65.08345794677734, "geo/layer_21/stable_rank_gate_proj": 59.69345474243164, "geo/layer_21/stable_rank_down_proj": 48.661895751953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13600821793079376, "geo/layer_21/attn_entropy_mean": 5.844844818115234, "geo/layer_21/attn_entropy_std": 0.3298019468784332, "geo/layer_27/stable_rank_q_proj": 44.82372283935547, "geo/layer_27/stable_rank_k_proj": 30.41465187072754, "geo/layer_27/stable_rank_o_proj": 106.94343566894531, "geo/layer_27/stable_rank_gate_proj": 69.77483367919922, "geo/layer_27/stable_rank_down_proj": 130.5467071533203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11039429157972336, "geo/layer_27/attn_entropy_mean": 4.312610626220703, "geo/layer_27/attn_entropy_std": 0.6958518624305725, "attnres/final_alpha/block_0": 0.26110947132110596, "attnres/block_norm/0": 1.782294511795044, "attnres/final_alpha/block_1": 0.0037549964617937803, "attnres/block_norm/1": 50877.5234375, "attnres/final_alpha/block_2": 0.008032234385609627, "attnres/block_norm/2": 30201.68359375, "attnres/final_alpha/block_3": 0.010376980528235435, "attnres/block_norm/3": 73171.4453125, "attnres/final_alpha/block_4": 0.011693445965647697, "attnres/block_norm/4": 17495.728515625, "attnres/final_alpha/block_5": 0.6075230240821838, "attnres/block_norm/5": 7188.5107421875, "attnres/final_alpha/block_6": 0.09750981628894806, "attnres/block_norm/6": 49306.98046875, "geo/tier1_time_s": 1.357978343963623, "geo/step": 13200.0, "geo/rankme_slope": 0.0001115569274584834} {"step": 13210, "timestamp": 1778339992.5599275, "train/loss": 2.302031397819519, "train/z_loss": 0.00136921931989491, "train/perplexity": 9.994464580863585, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788033.1157078552, "perf/iters_per_sec": 0.8526006296672131, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.172882080078125, "data/tokens_consumed": 27705475072, "data/tokens_consumed_B": 27.705475072, "train/loss_slope": -1.2856509068188998e-05} {"step": 13220, "timestamp": 1778340002.9145215, "train/loss": 2.313846206665039, "train/z_loss": 0.001354071742389351, "train/perplexity": 10.113247586917911, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026666.1427730187, "perf/iters_per_sec": 0.966389724146375, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347792148590087, "data/tokens_consumed": 27726446592, "data/tokens_consumed_B": 27.726446592, "train/loss_slope": -1.1501143080960085e-05} {"step": 13230, "timestamp": 1778340013.2746522, "train/loss": 2.3241530656814575, "train/z_loss": 0.0013619445031508803, "train/perplexity": 10.218022426213604, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025740.1749031732, "perf/iters_per_sec": 0.9659481882587305, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352522134780884, "data/tokens_consumed": 27747418112, "data/tokens_consumed_B": 27.747418112, "train/loss_slope": -9.541648005781176e-06} {"step": 13240, "timestamp": 1778340023.6481628, "train/loss": 2.281739592552185, "train/z_loss": 0.001381238829344511, "train/perplexity": 9.79370265154443, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022709.857138084, "perf/iters_per_sec": 0.9645032201471729, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03680317401886, "data/tokens_consumed": 27768389632, "data/tokens_consumed_B": 27.768389632, "train/loss_slope": -1.4392912026607687e-05} {"step": 13250, "timestamp": 1778340034.0038319, "grad/layer_0/attn": 0.0028752307407557964, "grad/layer_0/mlp": 0.003292666282504797, "grad/layer_0/attn_mlp_ratio": 0.8732226125407067, "grad/layer_4/attn": 0.0029146831948310137, "grad/layer_4/mlp": 0.0026136122178286314, "grad/layer_4/attn_mlp_ratio": 1.1151933953435411, "grad/layer_8/attn": 0.003373044775798917, "grad/layer_8/mlp": 0.0033997329883277416, "grad/layer_8/attn_mlp_ratio": 0.9921498800536861, "grad/layer_12/attn": 0.005653442349284887, "grad/layer_12/mlp": 0.006770764943212271, "grad/layer_12/attn_mlp_ratio": 0.8349783685009857, "grad/layer_16/attn": 0.004453275352716446, "grad/layer_16/mlp": 0.004350305534899235, "grad/layer_16/attn_mlp_ratio": 1.0236695364553614, "grad/layer_20/attn": 0.003884358098730445, "grad/layer_20/mlp": 0.005609870422631502, "grad/layer_20/attn_mlp_ratio": 0.6924149288401675, "grad/layer_24/attn": 0.006241422146558762, "grad/layer_24/mlp": 0.008621631190180779, "grad/layer_24/attn_mlp_ratio": 0.7239258948207575, "grad/layer_27/attn": 0.00626411521807313, "grad/layer_27/mlp": 0.008987459354102612, "grad/layer_27/attn_mlp_ratio": 0.6969839752894434} {"step": 13250, "timestamp": 1778340034.0194714, "train/loss": 2.349995183944702, "train/z_loss": 0.0013634640839882196, "train/perplexity": 10.485519225765552, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023345.4287766593, "perf/iters_per_sec": 0.9648062843211457, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364774942398072, "data/tokens_consumed": 27789361152, "data/tokens_consumed_B": 27.789361152, "train/loss_slope": -1.3779277941717874e-05} {"step": 13260, "timestamp": 1778340044.373134, "train/loss": 2.3518796443939207, "train/z_loss": 0.001364438538439572, "train/perplexity": 10.50529740177825, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027045.100540927, "perf/iters_per_sec": 0.9665704252915034, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345857620239258, "data/tokens_consumed": 27810332672, "data/tokens_consumed_B": 27.810332672, "train/loss_slope": -1.2513194325948154e-05} {"step": 13270, "timestamp": 1778340054.7358618, "train/loss": 2.3310984134674073, "train/z_loss": 0.0013693175744265318, "train/perplexity": 10.289237164941975, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024670.4782844419, "perf/iters_per_sec": 0.9654381171629152, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357991695404052, "data/tokens_consumed": 27831304192, "data/tokens_consumed_B": 27.831304192, "train/loss_slope": -1.2134788417615913e-05} {"step": 13275, "timestamp": 1778340060.5054626, "eos/sharpness": 18.818712234497067, "eos/L0_probe": 2.3115806579589844, "eos/L_plus": 2.4437994956970215, "eos/L_minus": 2.367548942565918, "eos/grad_norm": 0.12051891535520554, "eos/embed_grad_frac": 0.18748298287391663, "eos/time_s": 0.5987012386322021} {"step": 13275, "timestamp": 1778340061.8859096, "geo/rankme_last": 430.7697448730469, "geo/layer_0/stable_rank_q_proj": 20.68345832824707, "geo/layer_0/stable_rank_k_proj": 16.895090103149414, "geo/layer_0/stable_rank_o_proj": 43.83842849731445, "geo/layer_0/stable_rank_gate_proj": 124.64060974121094, "geo/layer_0/stable_rank_down_proj": 57.78489685058594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06107358634471893, "geo/layer_0/attn_entropy_mean": 6.2346510887146, "geo/layer_0/attn_entropy_std": 0.45646825432777405, "geo/layer_7/stable_rank_q_proj": 41.69110870361328, "geo/layer_7/stable_rank_k_proj": 38.744529724121094, "geo/layer_7/stable_rank_o_proj": 88.40087890625, "geo/layer_7/stable_rank_gate_proj": 78.39376068115234, "geo/layer_7/stable_rank_down_proj": 144.18341064453125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39036765694618225, "geo/layer_7/attn_entropy_mean": 4.73365592956543, "geo/layer_7/attn_entropy_std": 0.7436981797218323, "geo/layer_14/stable_rank_q_proj": 51.843017578125, "geo/layer_14/stable_rank_k_proj": 43.70753479003906, "geo/layer_14/stable_rank_o_proj": 42.32020568847656, "geo/layer_14/stable_rank_gate_proj": 71.60852813720703, "geo/layer_14/stable_rank_down_proj": 127.52579498291016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3638746738433838, "geo/layer_14/attn_entropy_mean": 5.485115051269531, "geo/layer_14/attn_entropy_std": 0.4854389429092407, "geo/layer_21/stable_rank_q_proj": 38.40404510498047, "geo/layer_21/stable_rank_k_proj": 28.68589973449707, "geo/layer_21/stable_rank_o_proj": 65.08528900146484, "geo/layer_21/stable_rank_gate_proj": 59.662071228027344, "geo/layer_21/stable_rank_down_proj": 48.68008804321289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1339770257472992, "geo/layer_21/attn_entropy_mean": 5.860176086425781, "geo/layer_21/attn_entropy_std": 0.330540269613266, "geo/layer_27/stable_rank_q_proj": 44.915679931640625, "geo/layer_27/stable_rank_k_proj": 30.454212188720703, "geo/layer_27/stable_rank_o_proj": 106.90205383300781, "geo/layer_27/stable_rank_gate_proj": 69.71770477294922, "geo/layer_27/stable_rank_down_proj": 130.32720947265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10134849697351456, "geo/layer_27/attn_entropy_mean": 4.32020378112793, "geo/layer_27/attn_entropy_std": 0.6671097278594971, "attnres/final_alpha/block_0": 0.26119983196258545, "attnres/block_norm/0": 1.7823289632797241, "attnres/final_alpha/block_1": 0.0037513149436563253, "attnres/block_norm/1": 50896.78125, "attnres/final_alpha/block_2": 0.00804988294839859, "attnres/block_norm/2": 30043.841796875, "attnres/final_alpha/block_3": 0.010491788387298584, "attnres/block_norm/3": 73092.953125, "attnres/final_alpha/block_4": 0.011963346973061562, "attnres/block_norm/4": 17550.34375, "attnres/final_alpha/block_5": 0.6065733432769775, "attnres/block_norm/5": 7190.4365234375, "attnres/final_alpha/block_6": 0.09797050803899765, "attnres/block_norm/6": 49012.8203125, "geo/tier1_time_s": 1.3588848114013672, "geo/step": 13275.0, "geo/rankme_slope": 0.00017060630892982192} {"step": 13280, "timestamp": 1778340067.0670729, "train/loss": 2.296156644821167, "train/z_loss": 0.001375188084784895, "train/perplexity": 9.935921700952631, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701614.7588811656, "perf/iters_per_sec": 0.8113931459813908, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2324481725692749, "data/tokens_consumed": 27852275712, "data/tokens_consumed_B": 27.852275712, "train/loss_slope": -1.67978470582273e-05} {"step": 13290, "timestamp": 1778340077.4202816, "train/loss": 2.3269005537033083, "train/z_loss": 0.001356083492282778, "train/perplexity": 10.246134922125144, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026571.3087418096, "perf/iters_per_sec": 0.9663445037564323, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348276376724244, "data/tokens_consumed": 27873247232, "data/tokens_consumed_B": 27.873247232, "train/loss_slope": -1.8585475629205118e-05} {"step": 13300, "timestamp": 1778340087.7759485, "grad/layer_0/attn": 0.0033901194110512733, "grad/layer_0/mlp": 0.003497007302939892, "grad/layer_0/attn_mlp_ratio": 0.9694344393441207, "grad/layer_4/attn": 0.0017143324948847294, "grad/layer_4/mlp": 0.002645510248839855, "grad/layer_4/attn_mlp_ratio": 0.648015796134202, "grad/layer_8/attn": 0.004330974537879229, "grad/layer_8/mlp": 0.003587867133319378, "grad/layer_8/attn_mlp_ratio": 1.2071167231771707, "grad/layer_12/attn": 0.01018114946782589, "grad/layer_12/mlp": 0.006890339311212301, "grad/layer_12/attn_mlp_ratio": 1.4775976712059524, "grad/layer_16/attn": 0.006093773990869522, "grad/layer_16/mlp": 0.004751849453896284, "grad/layer_16/attn_mlp_ratio": 1.2824004467634977, "grad/layer_20/attn": 0.004771133419126272, "grad/layer_20/mlp": 0.005685006733983755, "grad/layer_20/attn_mlp_ratio": 0.8392484931777843, "grad/layer_24/attn": 0.0085291163995862, "grad/layer_24/mlp": 0.009503303095698357, "grad/layer_24/attn_mlp_ratio": 0.8974896647985389, "grad/layer_27/attn": 0.008141824044287205, "grad/layer_27/mlp": 0.010419958271086216, "grad/layer_27/attn_mlp_ratio": 0.7813681930706667} {"step": 13300, "timestamp": 1778340087.7915182, "train/loss": 2.343402099609375, "train/z_loss": 0.0013620788464322685, "train/perplexity": 10.41661470945327, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023519.7456741396, "perf/iters_per_sec": 0.964889405095167, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363882064819336, "data/tokens_consumed": 27894218752, "data/tokens_consumed_B": 27.894218752, "train/loss_slope": -1.5998586777127164e-05} {"step": 13310, "timestamp": 1778340098.1500533, "train/loss": 2.353733992576599, "train/z_loss": 0.0013675820431672038, "train/perplexity": 10.524795953888749, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026286.5331458708, "perf/iters_per_sec": 0.9662087121705393, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349730730056763, "data/tokens_consumed": 27915190272, "data/tokens_consumed_B": 27.915190272, "train/loss_slope": -1.2437044261562737e-05} {"step": 13320, "timestamp": 1778340108.5124426, "train/loss": 2.3183383703231812, "train/z_loss": 0.0013718018308281898, "train/perplexity": 10.158780143472775, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025248.3872513943, "perf/iters_per_sec": 0.9657136856324169, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355036020278932, "data/tokens_consumed": 27936161792, "data/tokens_consumed_B": 27.936161792, "train/loss_slope": -1.1569477422366911e-05} {"step": 13330, "timestamp": 1778340118.8656147, "train/loss": 2.3199234008789062, "train/z_loss": 0.0013549798051826656, "train/perplexity": 10.174894888216697, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026732.499187046, "perf/iters_per_sec": 0.9664213653502683, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347453355789185, "data/tokens_consumed": 27957133312, "data/tokens_consumed_B": 27.957133312, "train/loss_slope": -1.3957444347492663e-05} {"step": 13340, "timestamp": 1778340129.221309, "train/loss": 2.3517812490463257, "train/z_loss": 0.0013662555953487753, "train/perplexity": 10.504263780241422, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026526.299535528, "perf/iters_per_sec": 0.9663230416944161, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348506212234496, "data/tokens_consumed": 27978104832, "data/tokens_consumed_B": 27.978104832, "train/loss_slope": -1.148142891891582e-05} {"step": 13350, "timestamp": 1778340139.5728621, "grad/layer_0/attn": 0.0028879698365926743, "grad/layer_0/mlp": 0.003425892908126116, "grad/layer_0/attn_mlp_ratio": 0.8429830790811319, "grad/layer_4/attn": 0.002283215755596757, "grad/layer_4/mlp": 0.002600262640044093, "grad/layer_4/attn_mlp_ratio": 0.8780711735145811, "grad/layer_8/attn": 0.006862119305878878, "grad/layer_8/mlp": 0.003394337370991707, "grad/layer_8/attn_mlp_ratio": 2.0216373193658925, "grad/layer_12/attn": 0.007031355984508991, "grad/layer_12/mlp": 0.0065651750192046165, "grad/layer_12/attn_mlp_ratio": 1.0710081386771682, "grad/layer_16/attn": 0.0047662220895290375, "grad/layer_16/mlp": 0.004481631331145763, "grad/layer_16/attn_mlp_ratio": 1.063501575878255, "grad/layer_20/attn": 0.0025700905825942755, "grad/layer_20/mlp": 0.0056363921612501144, "grad/layer_20/attn_mlp_ratio": 0.4559814972892335, "grad/layer_24/attn": 0.00822080671787262, "grad/layer_24/mlp": 0.009245486930012703, "grad/layer_24/attn_mlp_ratio": 0.8891696771826335, "grad/layer_27/attn": 0.0057999673299491405, "grad/layer_27/mlp": 0.008312750607728958, "grad/layer_27/attn_mlp_ratio": 0.6977193872247963} {"step": 13350, "timestamp": 1778340140.1741538, "eos/sharpness": 48.990011215209954, "eos/L0_probe": 2.309699058532715, "eos/L_plus": 2.5245473384857178, "eos/L_minus": 2.5847508907318115, "eos/grad_norm": 0.14997752010822296, "eos/embed_grad_frac": 0.10808452218770981, "eos/time_s": 0.5985229015350342} {"step": 13350, "timestamp": 1778340140.19596, "train/loss": 2.3548837423324587, "train/z_loss": 0.0013601343147456647, "train/perplexity": 10.536903794626864, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911716.2477597175, "perf/iters_per_sec": 0.9115773428724849, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0969996213912965, "data/tokens_consumed": 27999076352, "data/tokens_consumed_B": 27.999076352, "train/loss_slope": -1.0444669053964478e-05} {"step": 13350, "timestamp": 1778340141.5580502, "geo/rankme_last": 430.3369445800781, "geo/layer_0/stable_rank_q_proj": 20.705242156982422, "geo/layer_0/stable_rank_k_proj": 16.908842086791992, "geo/layer_0/stable_rank_o_proj": 43.84653091430664, "geo/layer_0/stable_rank_gate_proj": 124.64071655273438, "geo/layer_0/stable_rank_down_proj": 57.79774475097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06425859779119492, "geo/layer_0/attn_entropy_mean": 6.230649948120117, "geo/layer_0/attn_entropy_std": 0.46079209446907043, "geo/layer_7/stable_rank_q_proj": 41.72455596923828, "geo/layer_7/stable_rank_k_proj": 38.7402229309082, "geo/layer_7/stable_rank_o_proj": 88.51399230957031, "geo/layer_7/stable_rank_gate_proj": 78.24246215820312, "geo/layer_7/stable_rank_down_proj": 144.00579833984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40550199151039124, "geo/layer_7/attn_entropy_mean": 4.7335333824157715, "geo/layer_7/attn_entropy_std": 0.7719008326530457, "geo/layer_14/stable_rank_q_proj": 51.874271392822266, "geo/layer_14/stable_rank_k_proj": 43.73265075683594, "geo/layer_14/stable_rank_o_proj": 42.27439880371094, "geo/layer_14/stable_rank_gate_proj": 71.76454162597656, "geo/layer_14/stable_rank_down_proj": 127.79096221923828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36686885356903076, "geo/layer_14/attn_entropy_mean": 5.502939224243164, "geo/layer_14/attn_entropy_std": 0.49777138233184814, "geo/layer_21/stable_rank_q_proj": 38.35882568359375, "geo/layer_21/stable_rank_k_proj": 28.73467254638672, "geo/layer_21/stable_rank_o_proj": 65.09172058105469, "geo/layer_21/stable_rank_gate_proj": 59.73602294921875, "geo/layer_21/stable_rank_down_proj": 48.702606201171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13885752856731415, "geo/layer_21/attn_entropy_mean": 5.849795341491699, "geo/layer_21/attn_entropy_std": 0.3302530348300934, "geo/layer_27/stable_rank_q_proj": 44.8769645690918, "geo/layer_27/stable_rank_k_proj": 30.42864418029785, "geo/layer_27/stable_rank_o_proj": 107.03258514404297, "geo/layer_27/stable_rank_gate_proj": 69.68498992919922, "geo/layer_27/stable_rank_down_proj": 130.22410583496094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0907919630408287, "geo/layer_27/attn_entropy_mean": 4.325654983520508, "geo/layer_27/attn_entropy_std": 0.6892902255058289, "attnres/final_alpha/block_0": 0.264133095741272, "attnres/block_norm/0": 1.782531976699829, "attnres/final_alpha/block_1": 0.0038009588606655598, "attnres/block_norm/1": 50904.6953125, "attnres/final_alpha/block_2": 0.008221287280321121, "attnres/block_norm/2": 29957.509765625, "attnres/final_alpha/block_3": 0.010635373182594776, "attnres/block_norm/3": 72820.640625, "attnres/final_alpha/block_4": 0.011912008747458458, "attnres/block_norm/4": 17657.0, "attnres/final_alpha/block_5": 0.601517379283905, "attnres/block_norm/5": 7244.708984375, "attnres/final_alpha/block_6": 0.09977990388870239, "attnres/block_norm/6": 48869.43359375, "geo/tier1_time_s": 1.357835054397583, "geo/step": 13350.0, "geo/rankme_slope": 0.00016278222226390556} {"step": 13360, "timestamp": 1778340151.9137447, "train/loss": 2.2862998962402346, "train/z_loss": 0.0013572249561548233, "train/perplexity": 9.838466901575385, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790335.6404607669, "perf/iters_per_sec": 0.8536985590270838, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1713736534118653, "data/tokens_consumed": 28020047872, "data/tokens_consumed_B": 28.020047872, "train/loss_slope": -1.152625803542579e-05} {"step": 13370, "timestamp": 1778340162.2792506, "train/loss": 2.3710838317871095, "train/z_loss": 0.0013579003512859344, "train/perplexity": 10.708992743245348, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024672.9948786166, "perf/iters_per_sec": 0.9654393171685298, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357978820800782, "data/tokens_consumed": 28041019392, "data/tokens_consumed_B": 28.041019392, "train/loss_slope": -7.675156334374334e-06} {"step": 13380, "timestamp": 1778340172.6456592, "train/loss": 2.3537280797958373, "train/z_loss": 0.001373664871789515, "train/perplexity": 10.524733723261688, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024859.8461689083, "perf/iters_per_sec": 0.9655284148067991, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357023000717163, "data/tokens_consumed": 28061990912, "data/tokens_consumed_B": 28.061990912, "train/loss_slope": -5.762177088795053e-06} {"step": 13390, "timestamp": 1778340183.0074377, "train/loss": 2.3172396659851073, "train/z_loss": 0.0013664842816069722, "train/perplexity": 10.147624777006552, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025244.2838068488, "perf/iters_per_sec": 0.965711728957581, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355057001113892, "data/tokens_consumed": 28082962432, "data/tokens_consumed_B": 28.082962432, "train/loss_slope": -5.697931543757152e-06} {"step": 13400, "timestamp": 1778340193.3706706, "grad/layer_0/attn": 0.0030611513648182154, "grad/layer_0/mlp": 0.0033634768333286047, "grad/layer_0/attn_mlp_ratio": 0.9101151651986511, "grad/layer_4/attn": 0.0016605659620836377, "grad/layer_4/mlp": 0.0027055966202169657, "grad/layer_4/attn_mlp_ratio": 0.6137522084039466, "grad/layer_8/attn": 0.0032667559571564198, "grad/layer_8/mlp": 0.0035053433384746313, "grad/layer_8/attn_mlp_ratio": 0.931936061186052, "grad/layer_12/attn": 0.005707748234272003, "grad/layer_12/mlp": 0.007586200721561909, "grad/layer_12/attn_mlp_ratio": 0.7523855970236293, "grad/layer_16/attn": 0.007428726181387901, "grad/layer_16/mlp": 0.004483446944504976, "grad/layer_16/attn_mlp_ratio": 1.656922922841864, "grad/layer_20/attn": 0.0027615090366452932, "grad/layer_20/mlp": 0.005486685317009687, "grad/layer_20/attn_mlp_ratio": 0.5033109840932617, "grad/layer_24/attn": 0.005641858093440533, "grad/layer_24/mlp": 0.007418215274810791, "grad/layer_24/attn_mlp_ratio": 0.7605411555719942, "grad/layer_27/attn": 0.007322121877223253, "grad/layer_27/mlp": 0.006737148389220238, "grad/layer_27/attn_mlp_ratio": 1.086828038440744} {"step": 13400, "timestamp": 1778340193.3862076, "train/loss": 2.316458320617676, "train/z_loss": 0.0013538141967728734, "train/perplexity": 10.139699074155429, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021816.03365171, "perf/iters_per_sec": 0.9640770118959952, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372615337371827, "data/tokens_consumed": 28103933952, "data/tokens_consumed_B": 28.103933952, "train/loss_slope": -8.360628233347983e-06} {"step": 13410, "timestamp": 1778340203.758777, "train/loss": 2.344073939323425, "train/z_loss": 0.0013644684688188136, "train/perplexity": 10.423615356294, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023016.5659771955, "perf/iters_per_sec": 0.9646494703184106, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366459846496583, "data/tokens_consumed": 28124905472, "data/tokens_consumed_B": 28.124905472, "train/loss_slope": -5.673005936419815e-06} {"step": 13420, "timestamp": 1778340214.1209261, "train/loss": 2.3725098848342894, "train/z_loss": 0.0013556775171309709, "train/perplexity": 10.724275229207011, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025250.1125682709, "perf/iters_per_sec": 0.9657145083276133, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355027198791504, "data/tokens_consumed": 28145876992, "data/tokens_consumed_B": 28.145876992, "train/loss_slope": -4.301025929695699e-06} {"step": 13425, "timestamp": 1778340219.8784266, "eos/sharpness": 7.670974731445311, "eos/L0_probe": 2.3117690086364746, "eos/L_plus": 2.346395254135132, "eos/L_minus": 2.3538525104522705, "eos/grad_norm": 0.10928654670715332, "eos/embed_grad_frac": 0.3646398186683655, "eos/time_s": 0.5926146507263184} {"step": 13425, "timestamp": 1778340221.262518, "geo/rankme_last": 428.9168701171875, "geo/layer_0/stable_rank_q_proj": 20.71446990966797, "geo/layer_0/stable_rank_k_proj": 16.93461799621582, "geo/layer_0/stable_rank_o_proj": 43.92976760864258, "geo/layer_0/stable_rank_gate_proj": 124.40668487548828, "geo/layer_0/stable_rank_down_proj": 57.89619445800781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06365790218114853, "geo/layer_0/attn_entropy_mean": 6.234869956970215, "geo/layer_0/attn_entropy_std": 0.4603610932826996, "geo/layer_7/stable_rank_q_proj": 41.69408416748047, "geo/layer_7/stable_rank_k_proj": 38.775489807128906, "geo/layer_7/stable_rank_o_proj": 88.58124542236328, "geo/layer_7/stable_rank_gate_proj": 78.05345153808594, "geo/layer_7/stable_rank_down_proj": 144.12445068359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3931940197944641, "geo/layer_7/attn_entropy_mean": 4.719390869140625, "geo/layer_7/attn_entropy_std": 0.78885817527771, "geo/layer_14/stable_rank_q_proj": 51.841026306152344, "geo/layer_14/stable_rank_k_proj": 43.766563415527344, "geo/layer_14/stable_rank_o_proj": 42.358219146728516, "geo/layer_14/stable_rank_gate_proj": 71.67945098876953, "geo/layer_14/stable_rank_down_proj": 127.35397338867188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36219143867492676, "geo/layer_14/attn_entropy_mean": 5.479449272155762, "geo/layer_14/attn_entropy_std": 0.4876328110694885, "geo/layer_21/stable_rank_q_proj": 38.32607650756836, "geo/layer_21/stable_rank_k_proj": 28.724014282226562, "geo/layer_21/stable_rank_o_proj": 65.0477523803711, "geo/layer_21/stable_rank_gate_proj": 59.71448516845703, "geo/layer_21/stable_rank_down_proj": 48.70671844482422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13114027678966522, "geo/layer_21/attn_entropy_mean": 5.853597640991211, "geo/layer_21/attn_entropy_std": 0.3329225480556488, "geo/layer_27/stable_rank_q_proj": 44.87267303466797, "geo/layer_27/stable_rank_k_proj": 30.486286163330078, "geo/layer_27/stable_rank_o_proj": 107.179931640625, "geo/layer_27/stable_rank_gate_proj": 69.60862731933594, "geo/layer_27/stable_rank_down_proj": 130.4390106201172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0982685387134552, "geo/layer_27/attn_entropy_mean": 4.309221267700195, "geo/layer_27/attn_entropy_std": 0.6889585256576538, "attnres/final_alpha/block_0": 0.26313331723213196, "attnres/block_norm/0": 1.7825796604156494, "attnres/final_alpha/block_1": 0.0038519392255693674, "attnres/block_norm/1": 50830.3046875, "attnres/final_alpha/block_2": 0.008120940066874027, "attnres/block_norm/2": 29968.11328125, "attnres/final_alpha/block_3": 0.010564718395471573, "attnres/block_norm/3": 73207.0, "attnres/final_alpha/block_4": 0.011867748573422432, "attnres/block_norm/4": 17640.80859375, "attnres/final_alpha/block_5": 0.6023741960525513, "attnres/block_norm/5": 7232.833984375, "attnres/final_alpha/block_6": 0.10008719563484192, "attnres/block_norm/6": 48989.32421875, "geo/tier1_time_s": 1.3647966384887695, "geo/step": 13425.0, "geo/rankme_slope": 0.00016969918826905761} {"step": 13430, "timestamp": 1778340226.4440806, "train/loss": 2.324998712539673, "train/z_loss": 0.0013601652230136097, "train/perplexity": 10.226666919354587, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702841.9683165946, "perf/iters_per_sec": 0.8119783250411008, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.23155996799469, "data/tokens_consumed": 28166848512, "data/tokens_consumed_B": 28.166848512, "train/loss_slope": -5.953580253731618e-06} {"step": 13440, "timestamp": 1778340236.7888443, "train/loss": 2.3530462265014647, "train/z_loss": 0.001363147236406803, "train/perplexity": 10.51755984494431, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028136.3876879588, "perf/iters_per_sec": 0.9670907915534777, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340290784835815, "data/tokens_consumed": 28187820032, "data/tokens_consumed_B": 28.187820032, "train/loss_slope": -5.74109643706687e-06} {"step": 13450, "timestamp": 1778340247.1242154, "grad/layer_0/attn": 0.0028112526051700115, "grad/layer_0/mlp": 0.0032214017119258642, "grad/layer_0/attn_mlp_ratio": 0.8726798981618963, "grad/layer_4/attn": 0.0018823541468009353, "grad/layer_4/mlp": 0.0026342461351305246, "grad/layer_4/attn_mlp_ratio": 0.7145703092207955, "grad/layer_8/attn": 0.004839183762669563, "grad/layer_8/mlp": 0.0034054492134600878, "grad/layer_8/attn_mlp_ratio": 1.4210118305219284, "grad/layer_12/attn": 0.006367529276758432, "grad/layer_12/mlp": 0.0071205515414476395, "grad/layer_12/attn_mlp_ratio": 0.8942466254572218, "grad/layer_16/attn": 0.004086977336555719, "grad/layer_16/mlp": 0.004254243336617947, "grad/layer_16/attn_mlp_ratio": 0.960682527327303, "grad/layer_20/attn": 0.0031810288783162832, "grad/layer_20/mlp": 0.005318382289260626, "grad/layer_20/attn_mlp_ratio": 0.5981196246324284, "grad/layer_24/attn": 0.004276123363524675, "grad/layer_24/mlp": 0.007139446213841438, "grad/layer_24/attn_mlp_ratio": 0.5989432759280561, "grad/layer_27/attn": 0.0071115209721028805, "grad/layer_27/mlp": 0.006669950671494007, "grad/layer_27/attn_mlp_ratio": 1.0662029174931926} {"step": 13450, "timestamp": 1778340247.1396427, "train/loss": 2.352622890472412, "train/z_loss": 0.0013610037625767291, "train/perplexity": 10.513108325235049, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027403.0314075197, "perf/iters_per_sec": 0.9667411000287627, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034403109550476, "data/tokens_consumed": 28208791552, "data/tokens_consumed_B": 28.208791552, "train/loss_slope": -4.234721300804465e-06} {"step": 13460, "timestamp": 1778340257.4922814, "train/loss": 2.3308198928833006, "train/z_loss": 0.0013546805013902486, "train/perplexity": 10.286371799646913, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027075.6045362966, "perf/iters_per_sec": 0.9665849707299693, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345701932907105, "data/tokens_consumed": 28229763072, "data/tokens_consumed_B": 28.229763072, "train/loss_slope": -6.370296587001314e-06} {"step": 13470, "timestamp": 1778340267.844396, "train/loss": 2.3082345724105835, "train/z_loss": 0.0013627626583911478, "train/perplexity": 10.056654678199152, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026704.0134946278, "perf/iters_per_sec": 0.9664077823136462, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347598791122437, "data/tokens_consumed": 28250734592, "data/tokens_consumed_B": 28.250734592, "train/loss_slope": -6.15520570287475e-06} {"step": 13480, "timestamp": 1778340278.1961277, "train/loss": 2.3191548109054567, "train/z_loss": 0.0013601002399809658, "train/perplexity": 10.16707757056552, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027246.873722224, "perf/iters_per_sec": 0.9666666382418747, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344827890396118, "data/tokens_consumed": 28271706112, "data/tokens_consumed_B": 28.271706112, "train/loss_slope": -4.701899094442904e-06} {"step": 13490, "timestamp": 1778340288.5471451, "train/loss": 2.3494574546813967, "train/z_loss": 0.0013566516106948256, "train/perplexity": 10.479882370923633, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027286.6818348945, "perf/iters_per_sec": 0.9666856202291939, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344624757766723, "data/tokens_consumed": 28292677632, "data/tokens_consumed_B": 28.292677632, "train/loss_slope": -3.126303655336601e-06} {"step": 13500, "timestamp": 1778340298.8927007, "grad/layer_0/attn": 0.0035237008705735207, "grad/layer_0/mlp": 0.003872527275234461, "grad/layer_0/attn_mlp_ratio": 0.90992277888292, "grad/layer_4/attn": 0.00331299751996994, "grad/layer_4/mlp": 0.0026389281265437603, "grad/layer_4/attn_mlp_ratio": 1.2554329771632393, "grad/layer_8/attn": 0.0035657461266964674, "grad/layer_8/mlp": 0.003562809666618705, "grad/layer_8/attn_mlp_ratio": 1.0008241697621003, "grad/layer_12/attn": 0.008885443210601807, "grad/layer_12/mlp": 0.007102217525243759, "grad/layer_12/attn_mlp_ratio": 1.2510801103897242, "grad/layer_16/attn": 0.0038954177871346474, "grad/layer_16/mlp": 0.004387657158076763, "grad/layer_16/attn_mlp_ratio": 0.8878126886424401, "grad/layer_20/attn": 0.0029951741453260183, "grad/layer_20/mlp": 0.006788549479097128, "grad/layer_20/attn_mlp_ratio": 0.4412097327164803, "grad/layer_24/attn": 0.016407348215579987, "grad/layer_24/mlp": 0.014036901295185089, "grad/layer_24/attn_mlp_ratio": 1.168872513502731, "grad/layer_27/attn": 0.005851675756275654, "grad/layer_27/mlp": 0.014248192310333252, "grad/layer_27/attn_mlp_ratio": 0.4106960088517494} {"step": 13500, "timestamp": 1778340299.4901655, "eos/sharpness": 60.02912521362303, "eos/L0_probe": 2.3120296001434326, "eos/L_plus": 2.654387950897217, "eos/L_minus": 2.569962501525879, "eos/grad_norm": 0.247029110789299, "eos/embed_grad_frac": 0.041138846427202225, "eos/time_s": 0.5946078300476074} {"step": 13500, "timestamp": 1778340299.5118828, "train/loss": 2.320285701751709, "train/z_loss": 0.0013671269291080535, "train/perplexity": 10.178581929384158, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913440.5337877048, "perf/iters_per_sec": 0.9123995465219997, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0960110664367675, "data/tokens_consumed": 28313649152, "data/tokens_consumed_B": 28.313649152, "train/loss_slope": -4.660763942261895e-06} {"step": 13500, "timestamp": 1778340300.8789413, "geo/rankme_last": 429.948486328125, "geo/layer_0/stable_rank_q_proj": 20.71225357055664, "geo/layer_0/stable_rank_k_proj": 16.892759323120117, "geo/layer_0/stable_rank_o_proj": 43.95206069946289, "geo/layer_0/stable_rank_gate_proj": 124.62715911865234, "geo/layer_0/stable_rank_down_proj": 57.89862823486328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06519634276628494, "geo/layer_0/attn_entropy_mean": 6.233433723449707, "geo/layer_0/attn_entropy_std": 0.46215254068374634, "geo/layer_7/stable_rank_q_proj": 41.757286071777344, "geo/layer_7/stable_rank_k_proj": 38.783775329589844, "geo/layer_7/stable_rank_o_proj": 88.60016632080078, "geo/layer_7/stable_rank_gate_proj": 78.12358856201172, "geo/layer_7/stable_rank_down_proj": 144.06666564941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39829787611961365, "geo/layer_7/attn_entropy_mean": 4.7291259765625, "geo/layer_7/attn_entropy_std": 0.7753444910049438, "geo/layer_14/stable_rank_q_proj": 51.88308334350586, "geo/layer_14/stable_rank_k_proj": 43.87718963623047, "geo/layer_14/stable_rank_o_proj": 42.37544250488281, "geo/layer_14/stable_rank_gate_proj": 71.72500610351562, "geo/layer_14/stable_rank_down_proj": 127.21602630615234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3738167881965637, "geo/layer_14/attn_entropy_mean": 5.509871959686279, "geo/layer_14/attn_entropy_std": 0.48285141587257385, "geo/layer_21/stable_rank_q_proj": 38.24639129638672, "geo/layer_21/stable_rank_k_proj": 28.731863021850586, "geo/layer_21/stable_rank_o_proj": 65.02652740478516, "geo/layer_21/stable_rank_gate_proj": 59.711708068847656, "geo/layer_21/stable_rank_down_proj": 48.66465759277344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.133419468998909, "geo/layer_21/attn_entropy_mean": 5.832633018493652, "geo/layer_21/attn_entropy_std": 0.3329470157623291, "geo/layer_27/stable_rank_q_proj": 44.86692428588867, "geo/layer_27/stable_rank_k_proj": 30.464941024780273, "geo/layer_27/stable_rank_o_proj": 107.079345703125, "geo/layer_27/stable_rank_gate_proj": 69.52884674072266, "geo/layer_27/stable_rank_down_proj": 130.4278564453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10376793146133423, "geo/layer_27/attn_entropy_mean": 4.315225124359131, "geo/layer_27/attn_entropy_std": 0.6824504733085632, "attnres/final_alpha/block_0": 0.26182764768600464, "attnres/block_norm/0": 1.7826299667358398, "attnres/final_alpha/block_1": 0.003841768717393279, "attnres/block_norm/1": 50834.734375, "attnres/final_alpha/block_2": 0.007965571247041225, "attnres/block_norm/2": 29940.11328125, "attnres/final_alpha/block_3": 0.010433495044708252, "attnres/block_norm/3": 73032.8359375, "attnres/final_alpha/block_4": 0.01169792003929615, "attnres/block_norm/4": 17574.791015625, "attnres/final_alpha/block_5": 0.6059062480926514, "attnres/block_norm/5": 7225.74072265625, "attnres/final_alpha/block_6": 0.09832733869552612, "attnres/block_norm/6": 49354.03515625, "geo/tier1_time_s": 1.3636226654052734, "geo/step": 13500.0, "geo/rankme_slope": 0.00016284965157938174} {"step": 13500, "timestamp": 1778340307.7507198, "geo/ww_alpha_mean": 7.577516850913027, "geo/ww_alpha_std": 4.528502692011403, "geo/ww_alpha_min": 1.348520494140291, "geo/ww_alpha_max": 33.69522189776689, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.8581124743759636, "geo/ww_alpha_by_type/k_proj": 4.441030557566956, "geo/ww_alpha_by_type/v_proj": 10.52557208396348, "geo/ww_alpha_by_type/o_proj": 8.051974798305087, "geo/ww_alpha_by_type/gate_proj": 7.485148605830921, "geo/ww_alpha_by_type/up_proj": 10.83949955573266, "geo/ww_alpha_by_type/down_proj": 7.937334471831654, "geo/twonn_id/layer_0": 0.745969831943512, "geo/twonn_id/layer_7": 3.939574956893921, "geo/twonn_id/layer_14": 5.062496185302734, "geo/twonn_id/layer_21": 8.311570167541504, "geo/twonn_id/layer_27": 6.350898265838623, "geo/tier2_time_s": 6.865547180175781} {"step": 13500, "timestamp": 1778340308.4916997, "eoc/jacobian_sigma/layer_0/attn": 1508.9100341796875, "eoc/jacobian_sigma/layer_0/mlp": 10968.6259765625, "eoc/jacobian_sigma/layer_0": 10968.6259765625, "eoc/jacobian_sigma/layer_7/attn": 1.1505223512649536, "eoc/jacobian_sigma/layer_7/mlp": 1.7927234172821045, "eoc/jacobian_sigma/layer_7": 1.7927234172821045, "eoc/jacobian_sigma/layer_14/attn": 2.076172351837158, "eoc/jacobian_sigma/layer_14/mlp": 9.103830337524414, "eoc/jacobian_sigma/layer_14": 9.103830337524414, "eoc/jacobian_sigma/layer_21/attn": 1.0927205085754395, "eoc/jacobian_sigma/layer_21/mlp": 5.559336185455322, "eoc/jacobian_sigma/layer_21": 5.559336185455322, "eoc/jacobian_sigma/layer_27/attn": 3.8590846061706543, "eoc/jacobian_sigma/layer_27/mlp": 35.198299407958984, "eoc/jacobian_sigma/layer_27": 35.198299407958984, "eoc/layer0_sigma": 10968.6259765625, "eoc/sigma_max": 35.198299407958984, "eoc/sigma_min": 1.7927234172821045, "eoc/sigma_mean": 12.913547337055206, "eoc/time_s": 0.7343606948852539} {"step": 13510, "timestamp": 1778340318.8597465, "train/loss": 2.337486982345581, "train/z_loss": 0.0013701295130886138, "train/perplexity": 10.355181084625492, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1084105.659720802, "perf/iters_per_sec": 0.5169418619731913, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.934453511238098, "data/tokens_consumed": 28334620672, "data/tokens_consumed_B": 28.334620672, "train/loss_slope": -3.783183079240895e-06} {"step": 13520, "timestamp": 1778340329.2096834, "train/loss": 2.2716920614242553, "train/z_loss": 0.001400416367687285, "train/perplexity": 9.695792819028908, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027216.8785011384, "perf/iters_per_sec": 0.9666523354058926, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344980955123901, "data/tokens_consumed": 28355592192, "data/tokens_consumed_B": 28.355592192, "train/loss_slope": -9.027929403314836e-06} {"step": 13530, "timestamp": 1778340339.5553522, "train/loss": 2.2987277507781982, "train/z_loss": 0.0013671987690031528, "train/perplexity": 9.96150087772232, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028623.12264297, "perf/iters_per_sec": 0.9673228848662233, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337809801101685, "data/tokens_consumed": 28376563712, "data/tokens_consumed_B": 28.376563712, "train/loss_slope": -1.1392147334316609e-05} {"step": 13540, "timestamp": 1778340349.9008062, "train/loss": 2.3185362339019777, "train/z_loss": 0.001360555412247777, "train/perplexity": 10.160790394939387, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028079.6187158413, "perf/iters_per_sec": 0.9670637219981391, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340580224990845, "data/tokens_consumed": 28397535232, "data/tokens_consumed_B": 28.397535232, "train/loss_slope": -9.898174885619086e-06} {"step": 13550, "timestamp": 1778340360.267543, "grad/layer_0/attn": 0.0027620040345937014, "grad/layer_0/mlp": 0.0029898809734731913, "grad/layer_0/attn_mlp_ratio": 0.9237839120421011, "grad/layer_4/attn": 0.0016984074609354138, "grad/layer_4/mlp": 0.002572170225903392, "grad/layer_4/attn_mlp_ratio": 0.6603013197965042, "grad/layer_8/attn": 0.005892032291740179, "grad/layer_8/mlp": 0.0034768525511026382, "grad/layer_8/attn_mlp_ratio": 1.6946453827635168, "grad/layer_12/attn": 0.010675756260752678, "grad/layer_12/mlp": 0.0072471387684345245, "grad/layer_12/attn_mlp_ratio": 1.473099447183462, "grad/layer_16/attn": 0.0044328197836875916, "grad/layer_16/mlp": 0.004864913877099752, "grad/layer_16/attn_mlp_ratio": 0.9111815346692408, "grad/layer_20/attn": 0.00662707444280386, "grad/layer_20/mlp": 0.007022257428616285, "grad/layer_20/attn_mlp_ratio": 0.9437242105972301, "grad/layer_24/attn": 0.01819508709013462, "grad/layer_24/mlp": 0.016279228031635284, "grad/layer_24/attn_mlp_ratio": 1.1176873340067188, "grad/layer_27/attn": 0.013408826664090157, "grad/layer_27/mlp": 0.016676435247063637, "grad/layer_27/attn_mlp_ratio": 0.8040583245178451} {"step": 13550, "timestamp": 1778340360.2832003, "train/loss": 2.331881356239319, "train/z_loss": 0.001354783633723855, "train/perplexity": 10.297296203279977, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020913.8992967382, "perf/iters_per_sec": 0.963646840713853, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0377245664596557, "data/tokens_consumed": 28418506752, "data/tokens_consumed_B": 28.418506752, "train/loss_slope": -8.64444072275881e-06} {"step": 13560, "timestamp": 1778340370.652854, "train/loss": 2.308740258216858, "train/z_loss": 0.0013612765935249627, "train/perplexity": 10.061741471779778, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023966.0313724764, "perf/iters_per_sec": 0.9651022106993086, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361596822738648, "data/tokens_consumed": 28439478272, "data/tokens_consumed_B": 28.439478272, "train/loss_slope": -8.79822764495383e-06} {"step": 13570, "timestamp": 1778340381.0150495, "train/loss": 2.3785480260849, "train/z_loss": 0.0013665206031873822, "train/perplexity": 10.789225810907901, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025143.5681082124, "perf/iters_per_sec": 0.9656637039700567, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355571985244751, "data/tokens_consumed": 28460449792, "data/tokens_consumed_B": 28.460449792, "train/loss_slope": -4.117912635265306e-06} {"step": 13575, "timestamp": 1778340386.8089728, "eos/sharpness": 6.233239173889159, "eos/L0_probe": 2.3070359230041504, "eos/L_plus": 2.3394641876220703, "eos/L_minus": 2.336940050125122, "eos/grad_norm": 0.09023377299308777, "eos/embed_grad_frac": 0.278352826833725, "eos/time_s": 0.6109414100646973} {"step": 13575, "timestamp": 1778340388.1933966, "geo/rankme_last": 430.0766296386719, "geo/layer_0/stable_rank_q_proj": 20.739595413208008, "geo/layer_0/stable_rank_k_proj": 16.898048400878906, "geo/layer_0/stable_rank_o_proj": 43.96694564819336, "geo/layer_0/stable_rank_gate_proj": 124.6255874633789, "geo/layer_0/stable_rank_down_proj": 57.797176361083984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06723514944314957, "geo/layer_0/attn_entropy_mean": 6.230875492095947, "geo/layer_0/attn_entropy_std": 0.46611568331718445, "geo/layer_7/stable_rank_q_proj": 41.83084487915039, "geo/layer_7/stable_rank_k_proj": 38.69148254394531, "geo/layer_7/stable_rank_o_proj": 88.69618225097656, "geo/layer_7/stable_rank_gate_proj": 78.08861541748047, "geo/layer_7/stable_rank_down_proj": 144.27960205078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3983232080936432, "geo/layer_7/attn_entropy_mean": 4.710309982299805, "geo/layer_7/attn_entropy_std": 0.7780773043632507, "geo/layer_14/stable_rank_q_proj": 51.93212890625, "geo/layer_14/stable_rank_k_proj": 43.902835845947266, "geo/layer_14/stable_rank_o_proj": 42.311580657958984, "geo/layer_14/stable_rank_gate_proj": 71.8376235961914, "geo/layer_14/stable_rank_down_proj": 127.37698364257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3621613681316376, "geo/layer_14/attn_entropy_mean": 5.515232086181641, "geo/layer_14/attn_entropy_std": 0.47989630699157715, "geo/layer_21/stable_rank_q_proj": 38.24933624267578, "geo/layer_21/stable_rank_k_proj": 28.600021362304688, "geo/layer_21/stable_rank_o_proj": 65.04827117919922, "geo/layer_21/stable_rank_gate_proj": 59.68647384643555, "geo/layer_21/stable_rank_down_proj": 48.62144470214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13534852862358093, "geo/layer_21/attn_entropy_mean": 5.830231189727783, "geo/layer_21/attn_entropy_std": 0.3360888361930847, "geo/layer_27/stable_rank_q_proj": 44.887325286865234, "geo/layer_27/stable_rank_k_proj": 30.430377960205078, "geo/layer_27/stable_rank_o_proj": 106.94850158691406, "geo/layer_27/stable_rank_gate_proj": 69.56007385253906, "geo/layer_27/stable_rank_down_proj": 130.43724060058594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08777843415737152, "geo/layer_27/attn_entropy_mean": 4.29937744140625, "geo/layer_27/attn_entropy_std": 0.6972674131393433, "attnres/final_alpha/block_0": 0.26338905096054077, "attnres/block_norm/0": 1.7828980684280396, "attnres/final_alpha/block_1": 0.0038135694339871407, "attnres/block_norm/1": 50913.25, "attnres/final_alpha/block_2": 0.008050906471908092, "attnres/block_norm/2": 29923.9296875, "attnres/final_alpha/block_3": 0.010399481281638145, "attnres/block_norm/3": 72762.8046875, "attnres/final_alpha/block_4": 0.011820906773209572, "attnres/block_norm/4": 17674.888671875, "attnres/final_alpha/block_5": 0.6025381088256836, "attnres/block_norm/5": 7321.5478515625, "attnres/final_alpha/block_6": 0.09998804330825806, "attnres/block_norm/6": 49083.1875, "geo/tier1_time_s": 1.3641688823699951, "geo/step": 13575.0, "geo/rankme_slope": 0.0001846537638492897} {"step": 13580, "timestamp": 1778340393.4051378, "train/loss": 2.3057268619537354, "train/z_loss": 0.0013713631662540138, "train/perplexity": 10.031467094884329, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693580.73307126, "perf/iters_per_sec": 0.8075622239452648, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2382946729660034, "data/tokens_consumed": 28481421312, "data/tokens_consumed_B": 28.481421312, "train/loss_slope": -1.6153724423192984e-06} {"step": 13590, "timestamp": 1778340403.7605321, "train/loss": 2.3331782579422, "train/z_loss": 0.0013852215837687255, "train/perplexity": 10.310659447795333, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027145.4448587701, "perf/iters_per_sec": 0.9666182731908656, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345345497131349, "data/tokens_consumed": 28502392832, "data/tokens_consumed_B": 28.502392832, "train/loss_slope": -3.6430809113702324e-06} {"step": 13600, "timestamp": 1778340414.1029668, "grad/layer_0/attn": 0.003245972329750657, "grad/layer_0/mlp": 0.003394865896552801, "grad/layer_0/attn_mlp_ratio": 0.9561415187069732, "grad/layer_4/attn": 0.001751953735947609, "grad/layer_4/mlp": 0.00272519513964653, "grad/layer_4/attn_mlp_ratio": 0.6428727419085192, "grad/layer_8/attn": 0.002951935166493058, "grad/layer_8/mlp": 0.0034762579016387463, "grad/layer_8/attn_mlp_ratio": 0.8491703334739491, "grad/layer_12/attn": 0.00615066010504961, "grad/layer_12/mlp": 0.007628979627043009, "grad/layer_12/attn_mlp_ratio": 0.8062231550107428, "grad/layer_16/attn": 0.0037304481957107782, "grad/layer_16/mlp": 0.005100390408188105, "grad/layer_16/attn_mlp_ratio": 0.7314044267241817, "grad/layer_20/attn": 0.0031256715301424265, "grad/layer_20/mlp": 0.006085098721086979, "grad/layer_20/attn_mlp_ratio": 0.5136599457203374, "grad/layer_24/attn": 0.006496841087937355, "grad/layer_24/mlp": 0.008173485286533833, "grad/layer_24/attn_mlp_ratio": 0.7948678905869434, "grad/layer_27/attn": 0.004655465018004179, "grad/layer_27/mlp": 0.008720988407731056, "grad/layer_27/attn_mlp_ratio": 0.5338230882745876} {"step": 13600, "timestamp": 1778340414.1184886, "train/loss": 2.346074628829956, "train/z_loss": 0.0013568773865699768, "train/perplexity": 10.444490649684438, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025703.6464232937, "perf/iters_per_sec": 0.9659307701221913, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035270881652832, "data/tokens_consumed": 28523364352, "data/tokens_consumed_B": 28.523364352, "train/loss_slope": 5.066420414624602e-07} {"step": 13610, "timestamp": 1778340424.8802245, "train/loss": 2.3581844091415407, "train/z_loss": 0.0013520659646019339, "train/perplexity": 10.571740063083118, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949851.7924831659, "perf/iters_per_sec": 0.9297617876449422, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0755443096160888, "data/tokens_consumed": 28544335872, "data/tokens_consumed_B": 28.544335872, "train/loss_slope": 2.350333886499952e-06} {"step": 13620, "timestamp": 1778340435.2366915, "train/loss": 2.271890568733215, "train/z_loss": 0.0013666537357494234, "train/perplexity": 9.69771769581438, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026668.1973751192, "perf/iters_per_sec": 0.9663907038570019, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347781658172608, "data/tokens_consumed": 28565307392, "data/tokens_consumed_B": 28.565307392, "train/loss_slope": -9.672044455876855e-08} {"step": 13630, "timestamp": 1778340445.5925522, "train/loss": 2.3481624841690065, "train/z_loss": 0.0013713698834180831, "train/perplexity": 10.466320015600363, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026043.4634722346, "perf/iters_per_sec": 0.9660928075181172, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350972414016724, "data/tokens_consumed": 28586278912, "data/tokens_consumed_B": 28.586278912, "train/loss_slope": 3.3173767253511057e-06} {"step": 13640, "timestamp": 1778340456.5399563, "train/loss": 2.336556005477905, "train/z_loss": 0.0013665940263308586, "train/perplexity": 10.345545136693392, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917016.0557567682, "perf/iters_per_sec": 0.9141044882568208, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0939668416976929, "data/tokens_consumed": 28607250432, "data/tokens_consumed_B": 28.607250432, "train/loss_slope": 4.754586968019944e-06} {"step": 13650, "timestamp": 1778340466.8900437, "grad/layer_0/attn": 0.003716823412105441, "grad/layer_0/mlp": 0.0036007293965667486, "grad/layer_0/attn_mlp_ratio": 1.0322417764648488, "grad/layer_4/attn": 0.0021064570173621178, "grad/layer_4/mlp": 0.0026292570400983095, "grad/layer_4/attn_mlp_ratio": 0.801160519919081, "grad/layer_8/attn": 0.003930769860744476, "grad/layer_8/mlp": 0.003734580473974347, "grad/layer_8/attn_mlp_ratio": 1.0525331514166112, "grad/layer_12/attn": 0.007513058837503195, "grad/layer_12/mlp": 0.007341559510678053, "grad/layer_12/attn_mlp_ratio": 1.023360053710618, "grad/layer_16/attn": 0.004109138622879982, "grad/layer_16/mlp": 0.004932568874210119, "grad/layer_16/attn_mlp_ratio": 0.8330625774043033, "grad/layer_20/attn": 0.004098221193999052, "grad/layer_20/mlp": 0.005983003415167332, "grad/layer_20/attn_mlp_ratio": 0.684977233192288, "grad/layer_24/attn": 0.008441204205155373, "grad/layer_24/mlp": 0.00950017012655735, "grad/layer_24/attn_mlp_ratio": 0.8885318898348075, "grad/layer_27/attn": 0.007700513582676649, "grad/layer_27/mlp": 0.008848937228322029, "grad/layer_27/attn_mlp_ratio": 0.8702190214445616} {"step": 13650, "timestamp": 1778340467.504716, "eos/sharpness": 47.87509441375732, "eos/L0_probe": 2.31111216545105, "eos/L_plus": 2.523956060409546, "eos/L_minus": 2.577019214630127, "eos/grad_norm": 0.1450352519750595, "eos/embed_grad_frac": 0.10185505449771881, "eos/time_s": 0.6116461753845215} {"step": 13650, "timestamp": 1778340467.5328157, "train/loss": 2.3596979141235352, "train/z_loss": 0.001361657481174916, "train/perplexity": 10.5877525587763, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909267.4772548652, "perf/iters_per_sec": 0.9104096781038595, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0984066009521485, "data/tokens_consumed": 28628221952, "data/tokens_consumed_B": 28.628221952, "train/loss_slope": 5.807198842938836e-06} {"step": 13650, "timestamp": 1778340468.900342, "geo/rankme_last": 429.5169372558594, "geo/layer_0/stable_rank_q_proj": 20.721050262451172, "geo/layer_0/stable_rank_k_proj": 16.874977111816406, "geo/layer_0/stable_rank_o_proj": 43.92817687988281, "geo/layer_0/stable_rank_gate_proj": 124.45704650878906, "geo/layer_0/stable_rank_down_proj": 57.77374267578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06908270716667175, "geo/layer_0/attn_entropy_mean": 6.231940269470215, "geo/layer_0/attn_entropy_std": 0.4640001356601715, "geo/layer_7/stable_rank_q_proj": 41.86536407470703, "geo/layer_7/stable_rank_k_proj": 38.75517654418945, "geo/layer_7/stable_rank_o_proj": 88.6789321899414, "geo/layer_7/stable_rank_gate_proj": 78.05484771728516, "geo/layer_7/stable_rank_down_proj": 144.46571350097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38830506801605225, "geo/layer_7/attn_entropy_mean": 4.740569114685059, "geo/layer_7/attn_entropy_std": 0.7473594546318054, "geo/layer_14/stable_rank_q_proj": 51.870361328125, "geo/layer_14/stable_rank_k_proj": 43.926918029785156, "geo/layer_14/stable_rank_o_proj": 42.31117248535156, "geo/layer_14/stable_rank_gate_proj": 72.02397918701172, "geo/layer_14/stable_rank_down_proj": 127.17191314697266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38093283772468567, "geo/layer_14/attn_entropy_mean": 5.581306457519531, "geo/layer_14/attn_entropy_std": 0.46403947472572327, "geo/layer_21/stable_rank_q_proj": 38.285911560058594, "geo/layer_21/stable_rank_k_proj": 28.652006149291992, "geo/layer_21/stable_rank_o_proj": 65.05690002441406, "geo/layer_21/stable_rank_gate_proj": 59.76059341430664, "geo/layer_21/stable_rank_down_proj": 48.64071273803711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13714277744293213, "geo/layer_21/attn_entropy_mean": 5.8582763671875, "geo/layer_21/attn_entropy_std": 0.3262059986591339, "geo/layer_27/stable_rank_q_proj": 44.87407684326172, "geo/layer_27/stable_rank_k_proj": 30.48505210876465, "geo/layer_27/stable_rank_o_proj": 106.80782318115234, "geo/layer_27/stable_rank_gate_proj": 69.51481628417969, "geo/layer_27/stable_rank_down_proj": 130.3868865966797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10330633819103241, "geo/layer_27/attn_entropy_mean": 4.309890270233154, "geo/layer_27/attn_entropy_std": 0.6636227369308472, "attnres/final_alpha/block_0": 0.26397228240966797, "attnres/block_norm/0": 1.7828474044799805, "attnres/final_alpha/block_1": 0.003863839665427804, "attnres/block_norm/1": 50703.609375, "attnres/final_alpha/block_2": 0.008026095107197762, "attnres/block_norm/2": 29790.28125, "attnres/final_alpha/block_3": 0.010355101898312569, "attnres/block_norm/3": 73172.6953125, "attnres/final_alpha/block_4": 0.011808818206191063, "attnres/block_norm/4": 17614.291015625, "attnres/final_alpha/block_5": 0.6019816398620605, "attnres/block_norm/5": 7273.55859375, "attnres/final_alpha/block_6": 0.09999227523803711, "attnres/block_norm/6": 49278.828125, "geo/tier1_time_s": 1.3633112907409668, "geo/step": 13650.0, "geo/rankme_slope": 0.00017127440820078032} {"step": 13660, "timestamp": 1778340479.2514262, "train/loss": 2.3007917404174805, "train/z_loss": 0.0013726784847676754, "train/perplexity": 9.982082545193267, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790206.1054150374, "perf/iters_per_sec": 0.8536367919039904, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1714584112167359, "data/tokens_consumed": 28649193472, "data/tokens_consumed_B": 28.649193472, "train/loss_slope": 1.5765270384231746e-06} {"step": 13670, "timestamp": 1778340489.6115458, "train/loss": 2.3587130069732667, "train/z_loss": 0.0013598326593637466, "train/perplexity": 10.577329739173212, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025667.91230139, "perf/iters_per_sec": 0.9659137307650518, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352891445159913, "data/tokens_consumed": 28670164992, "data/tokens_consumed_B": 28.670164992, "train/loss_slope": 5.4375366945722075e-06} {"step": 13680, "timestamp": 1778340499.9680254, "train/loss": 2.2753440618515013, "train/z_loss": 0.0013689226703718304, "train/perplexity": 9.731266594241152, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025993.811250453, "perf/iters_per_sec": 0.9660691314937845, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351226091384889, "data/tokens_consumed": 28691136512, "data/tokens_consumed_B": 28.691136512, "train/loss_slope": 1.6631758180856961e-07} {"step": 13690, "timestamp": 1778340510.3205063, "train/loss": 2.3798150300979612, "train/z_loss": 0.001363371149636805, "train/perplexity": 10.802904466934326, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026784.80287111, "perf/iters_per_sec": 0.966446305690341, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034718632698059, "data/tokens_consumed": 28712108032, "data/tokens_consumed_B": 28.712108032, "train/loss_slope": 1.5968971698805823e-06} {"step": 13700, "timestamp": 1778340520.6573622, "grad/layer_0/attn": 0.004045197274535894, "grad/layer_0/mlp": 0.004329511430114508, "grad/layer_0/attn_mlp_ratio": 0.9343311009565337, "grad/layer_4/attn": 0.0025368938222527504, "grad/layer_4/mlp": 0.002889571012929082, "grad/layer_4/attn_mlp_ratio": 0.877948222454774, "grad/layer_8/attn": 0.004649018403142691, "grad/layer_8/mlp": 0.0038643244188278913, "grad/layer_8/attn_mlp_ratio": 1.203061073285017, "grad/layer_12/attn": 0.010421366430819035, "grad/layer_12/mlp": 0.007481440901756287, "grad/layer_12/attn_mlp_ratio": 1.392962455812001, "grad/layer_16/attn": 0.003978880122303963, "grad/layer_16/mlp": 0.004428749904036522, "grad/layer_16/attn_mlp_ratio": 0.8984205743556237, "grad/layer_20/attn": 0.004410776309669018, "grad/layer_20/mlp": 0.006287638563662767, "grad/layer_20/attn_mlp_ratio": 0.7014996480570002, "grad/layer_24/attn": 0.00963135901838541, "grad/layer_24/mlp": 0.009547412395477295, "grad/layer_24/attn_mlp_ratio": 1.0087925941136284, "grad/layer_27/attn": 0.005426095798611641, "grad/layer_27/mlp": 0.009141497313976288, "grad/layer_27/attn_mlp_ratio": 0.5935675035378527} {"step": 13700, "timestamp": 1778340520.6737883, "train/loss": 2.33599054813385, "train/z_loss": 0.0013703966978937387, "train/perplexity": 10.339696825858582, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026618.9347677294, "perf/iters_per_sec": 0.9663672136152884, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034803318977356, "data/tokens_consumed": 28733079552, "data/tokens_consumed_B": 28.733079552, "train/loss_slope": 1.8582034747664532e-06} {"step": 13710, "timestamp": 1778340531.0246968, "train/loss": 2.3222752094268797, "train/z_loss": 0.0013627379550598562, "train/perplexity": 10.198852453750794, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027076.1183940049, "perf/iters_per_sec": 0.9665852157564186, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345699310302734, "data/tokens_consumed": 28754051072, "data/tokens_consumed_B": 28.754051072, "train/loss_slope": 2.7880511983464914e-06} {"step": 13720, "timestamp": 1778340541.3776681, "train/loss": 2.3138842582702637, "train/z_loss": 0.001365893753245473, "train/perplexity": 10.113632419544329, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026502.0215406409, "perf/iters_per_sec": 0.9663114650443272, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034863018989563, "data/tokens_consumed": 28775022592, "data/tokens_consumed_B": 28.775022592, "train/loss_slope": 2.18910306844134e-06} {"step": 13725, "timestamp": 1778340547.1530006, "eos/sharpness": 64.62528705596922, "eos/L0_probe": 2.310932159423828, "eos/L_plus": 2.6863083839416504, "eos/L_minus": 2.5818088054656982, "eos/grad_norm": 0.22815226018428802, "eos/embed_grad_frac": 0.04648616164922714, "eos/time_s": 0.6062135696411133} {"step": 13725, "timestamp": 1778340548.5360901, "geo/rankme_last": 430.6832580566406, "geo/layer_0/stable_rank_q_proj": 20.750118255615234, "geo/layer_0/stable_rank_k_proj": 16.88785171508789, "geo/layer_0/stable_rank_o_proj": 43.93743133544922, "geo/layer_0/stable_rank_gate_proj": 124.35175323486328, "geo/layer_0/stable_rank_down_proj": 57.782100677490234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06628003716468811, "geo/layer_0/attn_entropy_mean": 6.225583076477051, "geo/layer_0/attn_entropy_std": 0.46318748593330383, "geo/layer_7/stable_rank_q_proj": 41.856693267822266, "geo/layer_7/stable_rank_k_proj": 38.77177047729492, "geo/layer_7/stable_rank_o_proj": 88.70409393310547, "geo/layer_7/stable_rank_gate_proj": 77.93803405761719, "geo/layer_7/stable_rank_down_proj": 144.2561798095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39450258016586304, "geo/layer_7/attn_entropy_mean": 4.726343154907227, "geo/layer_7/attn_entropy_std": 0.7727981805801392, "geo/layer_14/stable_rank_q_proj": 51.74990463256836, "geo/layer_14/stable_rank_k_proj": 43.90672302246094, "geo/layer_14/stable_rank_o_proj": 42.316383361816406, "geo/layer_14/stable_rank_gate_proj": 71.9571304321289, "geo/layer_14/stable_rank_down_proj": 127.21963500976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3808131515979767, "geo/layer_14/attn_entropy_mean": 5.554512023925781, "geo/layer_14/attn_entropy_std": 0.44618019461631775, "geo/layer_21/stable_rank_q_proj": 38.261417388916016, "geo/layer_21/stable_rank_k_proj": 28.611406326293945, "geo/layer_21/stable_rank_o_proj": 65.07974243164062, "geo/layer_21/stable_rank_gate_proj": 59.696903228759766, "geo/layer_21/stable_rank_down_proj": 48.64217758178711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1340107023715973, "geo/layer_21/attn_entropy_mean": 5.850545406341553, "geo/layer_21/attn_entropy_std": 0.32827243208885193, "geo/layer_27/stable_rank_q_proj": 44.84315872192383, "geo/layer_27/stable_rank_k_proj": 30.565975189208984, "geo/layer_27/stable_rank_o_proj": 106.697021484375, "geo/layer_27/stable_rank_gate_proj": 69.5255355834961, "geo/layer_27/stable_rank_down_proj": 130.24441528320312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09916334599256516, "geo/layer_27/attn_entropy_mean": 4.331358432769775, "geo/layer_27/attn_entropy_std": 0.6732913851737976, "attnres/final_alpha/block_0": 0.26180118322372437, "attnres/block_norm/0": 1.782799482345581, "attnres/final_alpha/block_1": 0.003781330306082964, "attnres/block_norm/1": 50906.34375, "attnres/final_alpha/block_2": 0.00792443286627531, "attnres/block_norm/2": 29991.373046875, "attnres/final_alpha/block_3": 0.010258663445711136, "attnres/block_norm/3": 73181.2109375, "attnres/final_alpha/block_4": 0.011538185179233551, "attnres/block_norm/4": 17693.05859375, "attnres/final_alpha/block_5": 0.6061895489692688, "attnres/block_norm/5": 7250.0126953125, "attnres/final_alpha/block_6": 0.09850664436817169, "attnres/block_norm/6": 49040.609375, "geo/tier1_time_s": 1.3636572360992432, "geo/step": 13725.0, "geo/rankme_slope": 0.00017376937102966185} {"step": 13730, "timestamp": 1778340553.7138968, "train/loss": 2.3170837640762327, "train/z_loss": 0.0013651526300236584, "train/perplexity": 10.146042866247932, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701010.3040003565, "perf/iters_per_sec": 0.8111049194337637, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2328861236572266, "data/tokens_consumed": 28795994112, "data/tokens_consumed_B": 28.795994112, "train/loss_slope": 1.8130516312053113e-06} {"step": 13740, "timestamp": 1778340564.0709138, "train/loss": 2.2856724500656127, "train/z_loss": 0.0013674700516276062, "train/perplexity": 9.832295729395497, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025714.7027973337, "perf/iters_per_sec": 0.9659360422121686, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352652311325072, "data/tokens_consumed": 28816965632, "data/tokens_consumed_B": 28.816965632, "train/loss_slope": 1.5589470123694062e-06} {"step": 13750, "timestamp": 1778340574.4187663, "grad/layer_0/attn": 0.0029668258503079414, "grad/layer_0/mlp": 0.003159327432513237, "grad/layer_0/attn_mlp_ratio": 0.9390687795981187, "grad/layer_4/attn": 0.001965408446267247, "grad/layer_4/mlp": 0.0025104968808591366, "grad/layer_4/attn_mlp_ratio": 0.7828762437287017, "grad/layer_8/attn": 0.0039454614743590355, "grad/layer_8/mlp": 0.00338456267490983, "grad/layer_8/attn_mlp_ratio": 1.1657226462476082, "grad/layer_12/attn": 0.005472466815263033, "grad/layer_12/mlp": 0.007515984121710062, "grad/layer_12/attn_mlp_ratio": 0.7281104714743426, "grad/layer_16/attn": 0.004785339348018169, "grad/layer_16/mlp": 0.004661469254642725, "grad/layer_16/attn_mlp_ratio": 1.0265731648009384, "grad/layer_20/attn": 0.0029836527537554502, "grad/layer_20/mlp": 0.005928911734372377, "grad/layer_20/attn_mlp_ratio": 0.5032378347166456, "grad/layer_24/attn": 0.005733804777264595, "grad/layer_24/mlp": 0.0079736253246665, "grad/layer_24/attn_mlp_ratio": 0.7190963296980324, "grad/layer_27/attn": 0.005904440768063068, "grad/layer_27/mlp": 0.007488274946808815, "grad/layer_27/attn_mlp_ratio": 0.7884914391037614} {"step": 13750, "timestamp": 1778340574.4348598, "train/loss": 2.3691264390945435, "train/z_loss": 0.0013532183831557632, "train/perplexity": 10.688051540869383, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024628.535969208, "perf/iters_per_sec": 0.9654181175085106, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358206272125243, "data/tokens_consumed": 28837937152, "data/tokens_consumed_B": 28.837937152, "train/loss_slope": 3.8152253488288424e-06} {"step": 13760, "timestamp": 1778340585.230203, "train/loss": 2.3495349884033203, "train/z_loss": 0.0013622664846479893, "train/perplexity": 10.480694946709779, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944050.9172863925, "perf/iters_per_sec": 0.9269957148010218, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0787536382675171, "data/tokens_consumed": 28858908672, "data/tokens_consumed_B": 28.858908672, "train/loss_slope": 3.879582499227416e-06} {"step": 13770, "timestamp": 1778340595.5902078, "train/loss": 2.3300758600234985, "train/z_loss": 0.001347444369457662, "train/perplexity": 10.278721247503865, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025390.1528184, "perf/iters_per_sec": 0.9657812847225189, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354311227798463, "data/tokens_consumed": 28879880192, "data/tokens_consumed_B": 28.879880192, "train/loss_slope": 3.1871593932006847e-06} {"step": 13780, "timestamp": 1778340605.9336023, "train/loss": 2.3357142925262453, "train/z_loss": 0.0013723773183301092, "train/perplexity": 10.336840821141326, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028584.7123284664, "perf/iters_per_sec": 0.9673045694010097, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338005542755127, "data/tokens_consumed": 28900851712, "data/tokens_consumed_B": 28.900851712, "train/loss_slope": 4.430989206212815e-06} {"step": 13790, "timestamp": 1778340616.2817082, "train/loss": 2.3343841552734377, "train/z_loss": 0.0013619172153994442, "train/perplexity": 10.323100544341651, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027870.9943275312, "perf/iters_per_sec": 0.9669642421376854, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341644048690797, "data/tokens_consumed": 28921823232, "data/tokens_consumed_B": 28.921823232, "train/loss_slope": 1.7888146837373974e-06} {"step": 13800, "timestamp": 1778340626.61843, "grad/layer_0/attn": 0.0028364108875393867, "grad/layer_0/mlp": 0.0032052802853286266, "grad/layer_0/attn_mlp_ratio": 0.8849181808001421, "grad/layer_4/attn": 0.0022745190653949976, "grad/layer_4/mlp": 0.0025608881842345, "grad/layer_4/attn_mlp_ratio": 0.8881758253171501, "grad/layer_8/attn": 0.00381011632271111, "grad/layer_8/mlp": 0.0033954791724681854, "grad/layer_8/attn_mlp_ratio": 1.1221144401042165, "grad/layer_12/attn": 0.007183082867413759, "grad/layer_12/mlp": 0.006186663638800383, "grad/layer_12/attn_mlp_ratio": 1.1610592026142004, "grad/layer_16/attn": 0.004011494107544422, "grad/layer_16/mlp": 0.004421575926244259, "grad/layer_16/attn_mlp_ratio": 0.9072543554004736, "grad/layer_20/attn": 0.0036038977559655905, "grad/layer_20/mlp": 0.005437020678073168, "grad/layer_20/attn_mlp_ratio": 0.6628442124958699, "grad/layer_24/attn": 0.008162624202668667, "grad/layer_24/mlp": 0.008415043354034424, "grad/layer_24/attn_mlp_ratio": 0.970003808923323, "grad/layer_27/attn": 0.005059388466179371, "grad/layer_27/mlp": 0.00906950980424881, "grad/layer_27/attn_mlp_ratio": 0.5578458505028139} {"step": 13800, "timestamp": 1778340627.2347236, "eos/sharpness": 49.45511817932128, "eos/L0_probe": 2.3072173595428467, "eos/L_plus": 2.541156530380249, "eos/L_minus": 2.5678293704986572, "eos/grad_norm": 0.12871068716049194, "eos/embed_grad_frac": 0.14266176521778107, "eos/time_s": 0.613478422164917} {"step": 13800, "timestamp": 1778340627.2545485, "train/loss": 2.3323693752288817, "train/z_loss": 0.0013717833906412124, "train/perplexity": 10.302322705782904, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912057.9634824267, "perf/iters_per_sec": 0.9117402856266149, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.096803569793701, "data/tokens_consumed": 28942794752, "data/tokens_consumed_B": 28.942794752, "train/loss_slope": 2.0680194640710735e-06} {"step": 13800, "timestamp": 1778340628.6185496, "geo/rankme_last": 429.3948974609375, "geo/layer_0/stable_rank_q_proj": 20.761995315551758, "geo/layer_0/stable_rank_k_proj": 16.906213760375977, "geo/layer_0/stable_rank_o_proj": 43.938411712646484, "geo/layer_0/stable_rank_gate_proj": 124.21748352050781, "geo/layer_0/stable_rank_down_proj": 57.80342483520508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06546758115291595, "geo/layer_0/attn_entropy_mean": 6.229755878448486, "geo/layer_0/attn_entropy_std": 0.4604513347148895, "geo/layer_7/stable_rank_q_proj": 41.84653854370117, "geo/layer_7/stable_rank_k_proj": 38.878971099853516, "geo/layer_7/stable_rank_o_proj": 88.50735473632812, "geo/layer_7/stable_rank_gate_proj": 77.91756439208984, "geo/layer_7/stable_rank_down_proj": 144.30201721191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39321303367614746, "geo/layer_7/attn_entropy_mean": 4.724495887756348, "geo/layer_7/attn_entropy_std": 0.7792645692825317, "geo/layer_14/stable_rank_q_proj": 51.75394821166992, "geo/layer_14/stable_rank_k_proj": 43.89476013183594, "geo/layer_14/stable_rank_o_proj": 42.304656982421875, "geo/layer_14/stable_rank_gate_proj": 71.82659149169922, "geo/layer_14/stable_rank_down_proj": 127.02933502197266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3624374568462372, "geo/layer_14/attn_entropy_mean": 5.556389808654785, "geo/layer_14/attn_entropy_std": 0.46593883633613586, "geo/layer_21/stable_rank_q_proj": 38.255279541015625, "geo/layer_21/stable_rank_k_proj": 28.656423568725586, "geo/layer_21/stable_rank_o_proj": 65.05682373046875, "geo/layer_21/stable_rank_gate_proj": 59.78923797607422, "geo/layer_21/stable_rank_down_proj": 48.64105224609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13532130420207977, "geo/layer_21/attn_entropy_mean": 5.871405124664307, "geo/layer_21/attn_entropy_std": 0.3363378345966339, "geo/layer_27/stable_rank_q_proj": 44.86019515991211, "geo/layer_27/stable_rank_k_proj": 30.61203956604004, "geo/layer_27/stable_rank_o_proj": 106.81806182861328, "geo/layer_27/stable_rank_gate_proj": 69.45011901855469, "geo/layer_27/stable_rank_down_proj": 130.25523376464844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09127688407897949, "geo/layer_27/attn_entropy_mean": 4.311474800109863, "geo/layer_27/attn_entropy_std": 0.6740214228630066, "attnres/final_alpha/block_0": 0.26390528678894043, "attnres/block_norm/0": 1.7829616069793701, "attnres/final_alpha/block_1": 0.003848866792395711, "attnres/block_norm/1": 50764.3046875, "attnres/final_alpha/block_2": 0.008049298077821732, "attnres/block_norm/2": 30030.380859375, "attnres/final_alpha/block_3": 0.010424088686704636, "attnres/block_norm/3": 73112.953125, "attnres/final_alpha/block_4": 0.011960659176111221, "attnres/block_norm/4": 17708.484375, "attnres/final_alpha/block_5": 0.6024526357650757, "attnres/block_norm/5": 7284.2275390625, "attnres/final_alpha/block_6": 0.09935912489891052, "attnres/block_norm/6": 49328.87109375, "geo/tier1_time_s": 1.3596971035003662, "geo/step": 13800.0, "geo/rankme_slope": 0.00015698273450005} {"step": 13810, "timestamp": 1778340638.9679458, "train/loss": 2.3308005809783934, "train/z_loss": 0.001371167809702456, "train/perplexity": 10.286173152131015, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791020.0641620655, "perf/iters_per_sec": 0.8540249176798179, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.170926022529602, "data/tokens_consumed": 28963766272, "data/tokens_consumed_B": 28.963766272, "train/loss_slope": 2.25839991130311e-06} {"step": 13820, "timestamp": 1778340649.8502696, "train/loss": 2.3259177446365356, "train/z_loss": 0.0013734404579736291, "train/perplexity": 10.236069874644429, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928317.0033271008, "perf/iters_per_sec": 0.9194931999812607, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0875556230545045, "data/tokens_consumed": 28984737792, "data/tokens_consumed_B": 28.984737792, "train/loss_slope": 3.063618754110239e-06} {"step": 13830, "timestamp": 1778340660.1996286, "train/loss": 2.335043501853943, "train/z_loss": 0.0013676249771378934, "train/perplexity": 10.329909289800625, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027517.66515047, "perf/iters_per_sec": 0.9667957616569853, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343446254730224, "data/tokens_consumed": 29005709312, "data/tokens_consumed_B": 29.005709312, "train/loss_slope": 4.300219011921982e-06} {"step": 13840, "timestamp": 1778340670.573018, "train/loss": 2.303684139251709, "train/z_loss": 0.0013615273055620492, "train/perplexity": 10.010996504303192, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022826.2866871979, "perf/iters_per_sec": 0.9645587380825033, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367434978485108, "data/tokens_consumed": 29026680832, "data/tokens_consumed_B": 29.026680832, "train/loss_slope": 4.742201033419733e-06} {"step": 13850, "timestamp": 1778340680.9262104, "grad/layer_0/attn": 0.002598919440060854, "grad/layer_0/mlp": 0.003078452544286847, "grad/layer_0/attn_mlp_ratio": 0.8442291437823688, "grad/layer_4/attn": 0.002094089752063155, "grad/layer_4/mlp": 0.0026221624575555325, "grad/layer_4/attn_mlp_ratio": 0.7986117207071005, "grad/layer_8/attn": 0.004906913731247187, "grad/layer_8/mlp": 0.003533506067469716, "grad/layer_8/attn_mlp_ratio": 1.3886812414313527, "grad/layer_12/attn": 0.005551918875426054, "grad/layer_12/mlp": 0.007576078176498413, "grad/layer_12/attn_mlp_ratio": 0.7328222693591406, "grad/layer_16/attn": 0.0034886698704212904, "grad/layer_16/mlp": 0.004510489758104086, "grad/layer_16/attn_mlp_ratio": 0.7734569814303272, "grad/layer_20/attn": 0.0032119860406965017, "grad/layer_20/mlp": 0.0064795901998877525, "grad/layer_20/attn_mlp_ratio": 0.4957081994446692, "grad/layer_24/attn": 0.013176979497075081, "grad/layer_24/mlp": 0.01146686915308237, "grad/layer_24/attn_mlp_ratio": 1.1491348864497624, "grad/layer_27/attn": 0.0066774035803973675, "grad/layer_27/mlp": 0.01240929402410984, "grad/layer_27/attn_mlp_ratio": 0.5380969709972412} {"step": 13850, "timestamp": 1778340680.941385, "train/loss": 2.352438282966614, "train/z_loss": 0.0013569532777182758, "train/perplexity": 10.511167705660922, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024526.2506094403, "perf/iters_per_sec": 0.9653693440482332, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358729600906371, "data/tokens_consumed": 29047652352, "data/tokens_consumed_B": 29.047652352, "train/loss_slope": 5.161294699645197e-06} {"step": 13860, "timestamp": 1778340691.3009698, "train/loss": 2.318395662307739, "train/z_loss": 0.0013734039384871721, "train/perplexity": 10.159362176820645, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025492.8986242313, "perf/iters_per_sec": 0.9658302777405888, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03537859916687, "data/tokens_consumed": 29068623872, "data/tokens_consumed_B": 29.068623872, "train/loss_slope": 4.5149169953922115e-06} {"step": 13870, "timestamp": 1778340701.680331, "train/loss": 2.3768981218338014, "train/z_loss": 0.0013577676261775195, "train/perplexity": 10.771439298432544, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021950.9050428707, "perf/iters_per_sec": 0.9641413235868791, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371923446655273, "data/tokens_consumed": 29089595392, "data/tokens_consumed_B": 29.089595392, "train/loss_slope": 7.4231599996013855e-06} {"step": 13875, "timestamp": 1778340707.4451845, "eos/sharpness": 46.40963077545165, "eos/L0_probe": 2.3057656288146973, "eos/L_plus": 2.5609967708587646, "eos/L_minus": 2.5146307945251465, "eos/grad_norm": 0.14197036623954773, "eos/embed_grad_frac": 0.1158895418047905, "eos/time_s": 0.5937447547912598} {"step": 13875, "timestamp": 1778340708.824837, "geo/rankme_last": 428.8565368652344, "geo/layer_0/stable_rank_q_proj": 20.752696990966797, "geo/layer_0/stable_rank_k_proj": 16.86675262451172, "geo/layer_0/stable_rank_o_proj": 43.947879791259766, "geo/layer_0/stable_rank_gate_proj": 124.31500244140625, "geo/layer_0/stable_rank_down_proj": 57.90934371948242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06729850172996521, "geo/layer_0/attn_entropy_mean": 6.229496002197266, "geo/layer_0/attn_entropy_std": 0.4574053883552551, "geo/layer_7/stable_rank_q_proj": 41.76079559326172, "geo/layer_7/stable_rank_k_proj": 38.80961608886719, "geo/layer_7/stable_rank_o_proj": 88.44214630126953, "geo/layer_7/stable_rank_gate_proj": 77.9762191772461, "geo/layer_7/stable_rank_down_proj": 144.03317260742188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.408596932888031, "geo/layer_7/attn_entropy_mean": 4.752965927124023, "geo/layer_7/attn_entropy_std": 0.7872378826141357, "geo/layer_14/stable_rank_q_proj": 51.666404724121094, "geo/layer_14/stable_rank_k_proj": 43.82752227783203, "geo/layer_14/stable_rank_o_proj": 42.34602737426758, "geo/layer_14/stable_rank_gate_proj": 71.71845245361328, "geo/layer_14/stable_rank_down_proj": 127.13321685791016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37610340118408203, "geo/layer_14/attn_entropy_mean": 5.546894073486328, "geo/layer_14/attn_entropy_std": 0.4712463617324829, "geo/layer_21/stable_rank_q_proj": 38.230804443359375, "geo/layer_21/stable_rank_k_proj": 28.598838806152344, "geo/layer_21/stable_rank_o_proj": 65.01278686523438, "geo/layer_21/stable_rank_gate_proj": 59.70595169067383, "geo/layer_21/stable_rank_down_proj": 48.60652542114258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13457533717155457, "geo/layer_21/attn_entropy_mean": 5.857382774353027, "geo/layer_21/attn_entropy_std": 0.32411158084869385, "geo/layer_27/stable_rank_q_proj": 44.825130462646484, "geo/layer_27/stable_rank_k_proj": 30.696712493896484, "geo/layer_27/stable_rank_o_proj": 106.77925872802734, "geo/layer_27/stable_rank_gate_proj": 69.39688110351562, "geo/layer_27/stable_rank_down_proj": 130.02975463867188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09847510606050491, "geo/layer_27/attn_entropy_mean": 4.325161933898926, "geo/layer_27/attn_entropy_std": 0.6781883835792542, "attnres/final_alpha/block_0": 0.2623017430305481, "attnres/block_norm/0": 1.7830619812011719, "attnres/final_alpha/block_1": 0.0038119456730782986, "attnres/block_norm/1": 50887.75390625, "attnres/final_alpha/block_2": 0.008053343743085861, "attnres/block_norm/2": 30038.8671875, "attnres/final_alpha/block_3": 0.010308688506484032, "attnres/block_norm/3": 73364.609375, "attnres/final_alpha/block_4": 0.011615915223956108, "attnres/block_norm/4": 17679.365234375, "attnres/final_alpha/block_5": 0.6057150363922119, "attnres/block_norm/5": 7258.6142578125, "attnres/final_alpha/block_6": 0.09819331765174866, "attnres/block_norm/6": 49491.52734375, "geo/tier1_time_s": 1.3613224029541016, "geo/step": 13875.0, "geo/rankme_slope": 0.00012292209852691075} {"step": 13880, "timestamp": 1778340714.0062675, "train/loss": 2.2874641180038453, "train/z_loss": 0.001371255691628903, "train/perplexity": 9.849927729040633, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702409.0773381013, "perf/iters_per_sec": 0.8117719065371043, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.23187313079834, "data/tokens_consumed": 29110566912, "data/tokens_consumed_B": 29.110566912, "train/loss_slope": 5.555687385602621e-06} {"step": 13890, "timestamp": 1778340724.3690557, "train/loss": 2.353069567680359, "train/z_loss": 0.0013622135506011547, "train/perplexity": 10.517805340055244, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024738.661556473, "perf/iters_per_sec": 0.9654706294805875, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357642889022827, "data/tokens_consumed": 29131538432, "data/tokens_consumed_B": 29.131538432, "train/loss_slope": 8.534070717977286e-06} {"step": 13900, "timestamp": 1778340734.7244017, "grad/layer_0/attn": 0.0028891207184642553, "grad/layer_0/mlp": 0.003245149739086628, "grad/layer_0/attn_mlp_ratio": 0.8902888500450322, "grad/layer_4/attn": 0.0038470106665045023, "grad/layer_4/mlp": 0.002726619364693761, "grad/layer_4/attn_mlp_ratio": 1.410908532091983, "grad/layer_8/attn": 0.003607428167015314, "grad/layer_8/mlp": 0.003510221606120467, "grad/layer_8/attn_mlp_ratio": 1.0276923992365943, "grad/layer_12/attn": 0.0060836272314190865, "grad/layer_12/mlp": 0.007210792973637581, "grad/layer_12/attn_mlp_ratio": 0.8436835129357165, "grad/layer_16/attn": 0.005651416722685099, "grad/layer_16/mlp": 0.004238493740558624, "grad/layer_16/attn_mlp_ratio": 1.3333549452416462, "grad/layer_20/attn": 0.002942252904176712, "grad/layer_20/mlp": 0.005748883355408907, "grad/layer_20/attn_mlp_ratio": 0.5117955385594846, "grad/layer_24/attn": 0.0046056946739554405, "grad/layer_24/mlp": 0.007901409640908241, "grad/layer_24/attn_mlp_ratio": 0.582895309188969, "grad/layer_27/attn": 0.005228297784924507, "grad/layer_27/mlp": 0.006940997205674648, "grad/layer_27/attn_mlp_ratio": 0.7532487846739381} {"step": 13900, "timestamp": 1778340734.740535, "train/loss": 2.3284393548965454, "train/z_loss": 0.0013653607573360204, "train/perplexity": 10.261913823952275, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023099.993099545, "perf/iters_per_sec": 0.9646892514703488, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366032361984252, "data/tokens_consumed": 29152509952, "data/tokens_consumed_B": 29.152509952, "train/loss_slope": 5.691598093334448e-06} {"step": 13910, "timestamp": 1778340745.102168, "train/loss": 2.35949227809906, "train/z_loss": 0.0013739252230152489, "train/perplexity": 10.58557555927442, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025163.8969515597, "perf/iters_per_sec": 0.965673397517948, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355468034744262, "data/tokens_consumed": 29173481472, "data/tokens_consumed_B": 29.173481472, "train/loss_slope": 6.644990656635574e-06} {"step": 13920, "timestamp": 1778340755.4672568, "train/loss": 2.3754802942276, "train/z_loss": 0.0013614132651127875, "train/perplexity": 10.756178075884574, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024329.1649166662, "perf/iters_per_sec": 0.9652753662665683, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035973811149597, "data/tokens_consumed": 29194452992, "data/tokens_consumed_B": 29.194452992, "train/loss_slope": 8.363288792983023e-06} {"step": 13930, "timestamp": 1778340765.831238, "train/loss": 2.3387081384658814, "train/z_loss": 0.0013677842100150884, "train/perplexity": 10.367834101465906, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024607.2858594186, "perf/iters_per_sec": 0.9654079846665471, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358314990997315, "data/tokens_consumed": 29215424512, "data/tokens_consumed_B": 29.215424512, "train/loss_slope": 1.0411196158449495e-05} {"step": 13940, "timestamp": 1778340776.1860735, "train/loss": 2.3284793376922606, "train/z_loss": 0.0013551180833019315, "train/perplexity": 10.262324132158925, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026336.526478497, "perf/iters_per_sec": 0.9662325508491979, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349475383758544, "data/tokens_consumed": 29236396032, "data/tokens_consumed_B": 29.236396032, "train/loss_slope": 9.388391169706285e-06} {"step": 13950, "timestamp": 1778340786.5361326, "grad/layer_0/attn": 0.0028200254309922457, "grad/layer_0/mlp": 0.003249757457524538, "grad/layer_0/attn_mlp_ratio": 0.8677648658629059, "grad/layer_4/attn": 0.0018746007699519396, "grad/layer_4/mlp": 0.0025095846503973007, "grad/layer_4/attn_mlp_ratio": 0.746976474755501, "grad/layer_8/attn": 0.005201129242777824, "grad/layer_8/mlp": 0.003318304428830743, "grad/layer_8/attn_mlp_ratio": 1.5674056427275904, "grad/layer_12/attn": 0.005169129930436611, "grad/layer_12/mlp": 0.0068045915104448795, "grad/layer_12/attn_mlp_ratio": 0.7596532204081322, "grad/layer_16/attn": 0.0030276323668658733, "grad/layer_16/mlp": 0.0045178732834756374, "grad/layer_16/attn_mlp_ratio": 0.6701454666568568, "grad/layer_20/attn": 0.00301408302038908, "grad/layer_20/mlp": 0.0059548113495111465, "grad/layer_20/attn_mlp_ratio": 0.5061592706913529, "grad/layer_24/attn": 0.010365177877247334, "grad/layer_24/mlp": 0.009775293059647083, "grad/layer_24/attn_mlp_ratio": 1.0603444528942954, "grad/layer_27/attn": 0.007189639378339052, "grad/layer_27/mlp": 0.009144283831119537, "grad/layer_27/attn_mlp_ratio": 0.786244109707869} {"step": 13950, "timestamp": 1778340787.1308994, "eos/sharpness": 57.511758804321275, "eos/L0_probe": 2.3065710067749023, "eos/L_plus": 2.5749409198760986, "eos/L_minus": 2.613318681716919, "eos/grad_norm": 0.15974044799804688, "eos/embed_grad_frac": 0.08370159566402435, "eos/time_s": 0.5919082164764404} {"step": 13950, "timestamp": 1778340787.1505904, "train/loss": 2.334038329124451, "train/z_loss": 0.0013525342452339827, "train/perplexity": 10.319531163462994, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913541.2266874744, "perf/iters_per_sec": 0.9124475606381771, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.095953392982483, "data/tokens_consumed": 29257367552, "data/tokens_consumed_B": 29.257367552, "train/loss_slope": 1.1122789448744718e-05} {"step": 13950, "timestamp": 1778340788.516694, "geo/rankme_last": 429.26715087890625, "geo/layer_0/stable_rank_q_proj": 20.762434005737305, "geo/layer_0/stable_rank_k_proj": 16.852121353149414, "geo/layer_0/stable_rank_o_proj": 43.89183807373047, "geo/layer_0/stable_rank_gate_proj": 124.25492095947266, "geo/layer_0/stable_rank_down_proj": 57.888423919677734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06364038586616516, "geo/layer_0/attn_entropy_mean": 6.232499599456787, "geo/layer_0/attn_entropy_std": 0.4561992883682251, "geo/layer_7/stable_rank_q_proj": 41.72537612915039, "geo/layer_7/stable_rank_k_proj": 38.74899673461914, "geo/layer_7/stable_rank_o_proj": 88.42435455322266, "geo/layer_7/stable_rank_gate_proj": 78.16078186035156, "geo/layer_7/stable_rank_down_proj": 144.4184112548828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39398252964019775, "geo/layer_7/attn_entropy_mean": 4.735221862792969, "geo/layer_7/attn_entropy_std": 0.7696476578712463, "geo/layer_14/stable_rank_q_proj": 51.719356536865234, "geo/layer_14/stable_rank_k_proj": 43.93680953979492, "geo/layer_14/stable_rank_o_proj": 42.35847473144531, "geo/layer_14/stable_rank_gate_proj": 71.73871612548828, "geo/layer_14/stable_rank_down_proj": 126.81187438964844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39158666133880615, "geo/layer_14/attn_entropy_mean": 5.571687698364258, "geo/layer_14/attn_entropy_std": 0.472032755613327, "geo/layer_21/stable_rank_q_proj": 38.19401931762695, "geo/layer_21/stable_rank_k_proj": 28.556591033935547, "geo/layer_21/stable_rank_o_proj": 65.05590057373047, "geo/layer_21/stable_rank_gate_proj": 59.70820617675781, "geo/layer_21/stable_rank_down_proj": 48.60399627685547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1300361156463623, "geo/layer_21/attn_entropy_mean": 5.838216781616211, "geo/layer_21/attn_entropy_std": 0.33819037675857544, "geo/layer_27/stable_rank_q_proj": 44.90179443359375, "geo/layer_27/stable_rank_k_proj": 30.683731079101562, "geo/layer_27/stable_rank_o_proj": 106.80854797363281, "geo/layer_27/stable_rank_gate_proj": 69.42301177978516, "geo/layer_27/stable_rank_down_proj": 129.92684936523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09179306775331497, "geo/layer_27/attn_entropy_mean": 4.308785438537598, "geo/layer_27/attn_entropy_std": 0.689396321773529, "attnres/final_alpha/block_0": 0.2640238404273987, "attnres/block_norm/0": 1.7832398414611816, "attnres/final_alpha/block_1": 0.0038140425458550453, "attnres/block_norm/1": 50950.09765625, "attnres/final_alpha/block_2": 0.008236341178417206, "attnres/block_norm/2": 29890.7890625, "attnres/final_alpha/block_3": 0.01037001796066761, "attnres/block_norm/3": 73674.1328125, "attnres/final_alpha/block_4": 0.011874405667185783, "attnres/block_norm/4": 17651.322265625, "attnres/final_alpha/block_5": 0.6007857322692871, "attnres/block_norm/5": 7265.927734375, "attnres/final_alpha/block_6": 0.10089559853076935, "attnres/block_norm/6": 49400.484375, "geo/tier1_time_s": 1.362199306488037, "geo/step": 13950.0, "geo/rankme_slope": 0.00013753575649009603} {"step": 13960, "timestamp": 1778340798.8775477, "train/loss": 2.3589022874832155, "train/z_loss": 0.001351585320662707, "train/perplexity": 10.57933201102968, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788919.1707524, "perf/iters_per_sec": 0.8530231336366654, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1723011493682862, "data/tokens_consumed": 29278339072, "data/tokens_consumed_B": 29.278339072, "train/loss_slope": 1.2740756225700446e-05} {"step": 13970, "timestamp": 1778340809.2353404, "train/loss": 2.3749937295913695, "train/z_loss": 0.0013517107348889113, "train/perplexity": 10.75094577304184, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025822.4271205652, "perf/iters_per_sec": 0.9659874091723276, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352101802825928, "data/tokens_consumed": 29299310592, "data/tokens_consumed_B": 29.299310592, "train/loss_slope": 1.4653738718865513e-05} {"step": 13980, "timestamp": 1778340819.5972984, "train/loss": 2.3108176231384276, "train/z_loss": 0.001370603672694415, "train/perplexity": 10.082665106050523, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024869.9610789856, "perf/iters_per_sec": 0.9655332379717758, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356971263885497, "data/tokens_consumed": 29320282112, "data/tokens_consumed_B": 29.320282112, "train/loss_slope": 1.1178656479921082e-05} {"step": 13990, "timestamp": 1778340829.9620767, "train/loss": 2.358525109291077, "train/z_loss": 0.0013763188617303967, "train/perplexity": 10.575342470138933, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024706.8296966567, "perf/iters_per_sec": 0.9654554508670123, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357805728912353, "data/tokens_consumed": 29341253632, "data/tokens_consumed_B": 29.341253632, "train/loss_slope": 1.2788968468227468e-05} {"step": 14000, "timestamp": 1778340840.3157668, "grad/layer_0/attn": 0.0034151689615100622, "grad/layer_0/mlp": 0.003646255936473608, "grad/layer_0/attn_mlp_ratio": 0.9366234645477512, "grad/layer_4/attn": 0.0019810302183032036, "grad/layer_4/mlp": 0.0025909231044352055, "grad/layer_4/attn_mlp_ratio": 0.7646039893857247, "grad/layer_8/attn": 0.0033377427607774734, "grad/layer_8/mlp": 0.0035763252526521683, "grad/layer_8/attn_mlp_ratio": 0.9332883425447396, "grad/layer_12/attn": 0.005361218936741352, "grad/layer_12/mlp": 0.007276591379195452, "grad/layer_12/attn_mlp_ratio": 0.7367761337254739, "grad/layer_16/attn": 0.0033831594046205282, "grad/layer_16/mlp": 0.0047556147910654545, "grad/layer_16/attn_mlp_ratio": 0.7114031480927088, "grad/layer_20/attn": 0.0030505144968628883, "grad/layer_20/mlp": 0.00641333544626832, "grad/layer_20/attn_mlp_ratio": 0.4756517844505842, "grad/layer_24/attn": 0.0089438296854496, "grad/layer_24/mlp": 0.010597765445709229, "grad/layer_24/attn_mlp_ratio": 0.8439354170342761, "grad/layer_27/attn": 0.005973214283585548, "grad/layer_27/mlp": 0.010760894976556301, "grad/layer_27/attn_mlp_ratio": 0.5550852639199875} {"step": 14000, "timestamp": 1778340840.3332536, "train/loss": 2.3383163452148437, "train/z_loss": 0.0013765402836725116, "train/perplexity": 10.363772849674545, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023127.7261151953, "perf/iters_per_sec": 0.9647024756027199, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365890264511108, "data/tokens_consumed": 29362225152, "data/tokens_consumed_B": 29.362225152, "train/loss_slope": 1.3335848054905919e-05} {"step": 14000, "timestamp": 1778340847.3309202, "geo/ww_alpha_mean": 7.605597681391283, "geo/ww_alpha_std": 4.302656840283951, "geo/ww_alpha_min": 1.3541291085603773, "geo/ww_alpha_max": 27.018665179423802, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.9110292940166183, "geo/ww_alpha_by_type/k_proj": 4.633078983704942, "geo/ww_alpha_by_type/v_proj": 9.843254085435388, "geo/ww_alpha_by_type/o_proj": 8.126909172346815, "geo/ww_alpha_by_type/gate_proj": 7.780824829027423, "geo/ww_alpha_by_type/up_proj": 11.269700857402928, "geo/ww_alpha_by_type/down_proj": 7.769579221519668, "geo/twonn_id/layer_0": 0.7460981607437134, "geo/twonn_id/layer_7": 3.568214178085327, "geo/twonn_id/layer_14": 4.893798828125, "geo/twonn_id/layer_21": 8.485990524291992, "geo/twonn_id/layer_27": 6.238415718078613, "geo/tier2_time_s": 6.9893786907196045} {"step": 14000, "timestamp": 1778340848.1109123, "eoc/jacobian_sigma/layer_0/attn": 1563.1422119140625, "eoc/jacobian_sigma/layer_0/mlp": 11159.685546875, "eoc/jacobian_sigma/layer_0": 11159.685546875, "eoc/jacobian_sigma/layer_7/attn": 1.1466385126113892, "eoc/jacobian_sigma/layer_7/mlp": 1.8264057636260986, "eoc/jacobian_sigma/layer_7": 1.8264057636260986, "eoc/jacobian_sigma/layer_14/attn": 1.947012186050415, "eoc/jacobian_sigma/layer_14/mlp": 12.373289108276367, "eoc/jacobian_sigma/layer_14": 12.373289108276367, "eoc/jacobian_sigma/layer_21/attn": 1.0929844379425049, "eoc/jacobian_sigma/layer_21/mlp": 5.764552116394043, "eoc/jacobian_sigma/layer_21": 5.764552116394043, "eoc/jacobian_sigma/layer_27/attn": 3.9347593784332275, "eoc/jacobian_sigma/layer_27/mlp": 42.79847717285156, "eoc/jacobian_sigma/layer_27": 42.79847717285156, "eoc/layer0_sigma": 11159.685546875, "eoc/sigma_max": 42.79847717285156, "eoc/sigma_min": 1.8264057636260986, "eoc/sigma_mean": 15.690681040287018, "eoc/time_s": 0.7720370292663574} {"step": 14010, "timestamp": 1778340858.515447, "train/loss": 2.3200912475585938, "train/z_loss": 0.0013627783861011266, "train/perplexity": 10.176602853874025, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1153876.5805630574, "perf/iters_per_sec": 0.5502112295928275, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8174838066101073, "data/tokens_consumed": 29383196672, "data/tokens_consumed_B": 29.383196672, "train/loss_slope": 1.35554150708116e-05} {"step": 14020, "timestamp": 1778340868.9030464, "train/loss": 2.325845742225647, "train/z_loss": 0.0013598461053334176, "train/perplexity": 10.235332879468462, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020221.7145097698, "perf/iters_per_sec": 0.9633167812870835, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0380801200866698, "data/tokens_consumed": 29404168192, "data/tokens_consumed_B": 29.404168192, "train/loss_slope": 1.2244001366231594e-05} {"step": 14025, "timestamp": 1778340874.669982, "eos/sharpness": 63.62802982330321, "eos/L0_probe": 2.3109853267669678, "eos/L_plus": 2.6115734577178955, "eos/L_minus": 2.6466774940490723, "eos/grad_norm": 0.23339910805225372, "eos/embed_grad_frac": 0.04231946915388107, "eos/time_s": 0.5990304946899414} {"step": 14025, "timestamp": 1778340876.0506637, "geo/rankme_last": 429.37872314453125, "geo/layer_0/stable_rank_q_proj": 20.77775001525879, "geo/layer_0/stable_rank_k_proj": 16.887413024902344, "geo/layer_0/stable_rank_o_proj": 43.948570251464844, "geo/layer_0/stable_rank_gate_proj": 124.02094268798828, "geo/layer_0/stable_rank_down_proj": 57.924644470214844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0664714053273201, "geo/layer_0/attn_entropy_mean": 6.236085891723633, "geo/layer_0/attn_entropy_std": 0.4593350291252136, "geo/layer_7/stable_rank_q_proj": 41.713043212890625, "geo/layer_7/stable_rank_k_proj": 38.656795501708984, "geo/layer_7/stable_rank_o_proj": 88.262451171875, "geo/layer_7/stable_rank_gate_proj": 78.09960174560547, "geo/layer_7/stable_rank_down_proj": 144.60256958007812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39895835518836975, "geo/layer_7/attn_entropy_mean": 4.732267379760742, "geo/layer_7/attn_entropy_std": 0.7710813879966736, "geo/layer_14/stable_rank_q_proj": 51.62275314331055, "geo/layer_14/stable_rank_k_proj": 43.858585357666016, "geo/layer_14/stable_rank_o_proj": 42.39537048339844, "geo/layer_14/stable_rank_gate_proj": 71.78162384033203, "geo/layer_14/stable_rank_down_proj": 126.59574127197266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37875866889953613, "geo/layer_14/attn_entropy_mean": 5.525996208190918, "geo/layer_14/attn_entropy_std": 0.49410077929496765, "geo/layer_21/stable_rank_q_proj": 38.15531921386719, "geo/layer_21/stable_rank_k_proj": 28.595687866210938, "geo/layer_21/stable_rank_o_proj": 65.03817749023438, "geo/layer_21/stable_rank_gate_proj": 59.695491790771484, "geo/layer_21/stable_rank_down_proj": 48.61967086791992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13843655586242676, "geo/layer_21/attn_entropy_mean": 5.8285675048828125, "geo/layer_21/attn_entropy_std": 0.3351020812988281, "geo/layer_27/stable_rank_q_proj": 44.97535705566406, "geo/layer_27/stable_rank_k_proj": 30.649736404418945, "geo/layer_27/stable_rank_o_proj": 106.4985122680664, "geo/layer_27/stable_rank_gate_proj": 69.46357727050781, "geo/layer_27/stable_rank_down_proj": 129.98233032226562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1020236685872078, "geo/layer_27/attn_entropy_mean": 4.320542812347412, "geo/layer_27/attn_entropy_std": 0.6978429555892944, "attnres/final_alpha/block_0": 0.26380228996276855, "attnres/block_norm/0": 1.7832911014556885, "attnres/final_alpha/block_1": 0.003835295094177127, "attnres/block_norm/1": 50870.63671875, "attnres/final_alpha/block_2": 0.008290272206068039, "attnres/block_norm/2": 30030.99609375, "attnres/final_alpha/block_3": 0.01055191457271576, "attnres/block_norm/3": 73464.59375, "attnres/final_alpha/block_4": 0.011912340298295021, "attnres/block_norm/4": 17746.73828125, "attnres/final_alpha/block_5": 0.6009972095489502, "attnres/block_norm/5": 7294.2177734375, "attnres/final_alpha/block_6": 0.10061068832874298, "attnres/block_norm/6": 49494.7421875, "geo/tier1_time_s": 1.360959529876709, "geo/step": 14025.0, "geo/rankme_slope": 0.0001247470667954682} {"step": 14030, "timestamp": 1778340881.229147, "train/loss": 2.284401202201843, "train/z_loss": 0.0013775138068012893, "train/perplexity": 9.81980438593264, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702311.9501669896, "perf/iters_per_sec": 0.811725592692847, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.231943416595459, "data/tokens_consumed": 29425139712, "data/tokens_consumed_B": 29.425139712, "train/loss_slope": 9.481214368232478e-06} {"step": 14040, "timestamp": 1778340891.5824122, "train/loss": 2.303900647163391, "train/z_loss": 0.0013666022103279829, "train/perplexity": 10.013164198903242, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026865.084937791, "perf/iters_per_sec": 0.9664845871628719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346776485443114, "data/tokens_consumed": 29446111232, "data/tokens_consumed_B": 29.446111232, "train/loss_slope": 7.326567584317321e-06} {"step": 14050, "timestamp": 1778340901.9318264, "grad/layer_0/attn": 0.0029540567193180323, "grad/layer_0/mlp": 0.0032522580586373806, "grad/layer_0/attn_mlp_ratio": 0.9083094192484745, "grad/layer_4/attn": 0.002262903843075037, "grad/layer_4/mlp": 0.0026317366864532232, "grad/layer_4/attn_mlp_ratio": 0.8598518874392221, "grad/layer_8/attn": 0.004230147693306208, "grad/layer_8/mlp": 0.0035284576006233692, "grad/layer_8/attn_mlp_ratio": 1.1988659216628477, "grad/layer_12/attn": 0.009123800322413445, "grad/layer_12/mlp": 0.007387634366750717, "grad/layer_12/attn_mlp_ratio": 1.2350097129841267, "grad/layer_16/attn": 0.0033064528834074736, "grad/layer_16/mlp": 0.004810893442481756, "grad/layer_16/attn_mlp_ratio": 0.6872845666216512, "grad/layer_20/attn": 0.004530897829681635, "grad/layer_20/mlp": 0.006040382198989391, "grad/layer_20/attn_mlp_ratio": 0.7501011699937757, "grad/layer_24/attn": 0.010859747417271137, "grad/layer_24/mlp": 0.009457251988351345, "grad/layer_24/attn_mlp_ratio": 1.1482983974432985, "grad/layer_27/attn": 0.006047178991138935, "grad/layer_27/mlp": 0.00815355870872736, "grad/layer_27/attn_mlp_ratio": 0.7416612957603482} {"step": 14050, "timestamp": 1778340901.9459658, "train/loss": 2.33693323135376, "train/z_loss": 0.001369488553609699, "train/perplexity": 10.349448480193574, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024497.5007313294, "perf/iters_per_sec": 0.9653556350380561, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358876705169677, "data/tokens_consumed": 29467082752, "data/tokens_consumed_B": 29.467082752, "train/loss_slope": 6.130703383296095e-06} {"step": 14060, "timestamp": 1778340912.3123126, "train/loss": 2.3101179122924806, "train/z_loss": 0.0013553799013607203, "train/perplexity": 10.075612623556752, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024461.4363986584, "perf/iters_per_sec": 0.9653384382241528, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359061241149903, "data/tokens_consumed": 29488054272, "data/tokens_consumed_B": 29.488054272, "train/loss_slope": 6.428131411964745e-06} {"step": 14070, "timestamp": 1778340922.654803, "train/loss": 2.3634270429611206, "train/z_loss": 0.0013597820186987519, "train/perplexity": 10.627309362536206, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029281.7512580906, "perf/iters_per_sec": 0.9676369434633687, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033445453643799, "data/tokens_consumed": 29509025792, "data/tokens_consumed_B": 29.509025792, "train/loss_slope": 7.5039400340485625e-06} {"step": 14080, "timestamp": 1778340933.0050094, "train/loss": 2.357455086708069, "train/z_loss": 0.0013612348702736199, "train/perplexity": 10.564032666824415, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027165.300009571, "perf/iters_per_sec": 0.9666277408645492, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034524416923523, "data/tokens_consumed": 29529997312, "data/tokens_consumed_B": 29.529997312, "train/loss_slope": 8.312594507894397e-06} {"step": 14090, "timestamp": 1778340943.3546832, "train/loss": 2.307715082168579, "train/z_loss": 0.0013673976296558975, "train/perplexity": 10.051431700986926, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027591.6956217806, "perf/iters_per_sec": 0.9668310621365455, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343068599700929, "data/tokens_consumed": 29550968832, "data/tokens_consumed_B": 29.550968832, "train/loss_slope": 7.031084058379883e-06} {"step": 14100, "timestamp": 1778340953.6981978, "grad/layer_0/attn": 0.0027582934126257896, "grad/layer_0/mlp": 0.0031063067726790905, "grad/layer_0/attn_mlp_ratio": 0.8879655248764426, "grad/layer_4/attn": 0.002573787234723568, "grad/layer_4/mlp": 0.002561792265623808, "grad/layer_4/attn_mlp_ratio": 1.0046822175211063, "grad/layer_8/attn": 0.0032225262839347124, "grad/layer_8/mlp": 0.003337275702506304, "grad/layer_8/attn_mlp_ratio": 0.9656158120088804, "grad/layer_12/attn": 0.00688970135524869, "grad/layer_12/mlp": 0.006840888876467943, "grad/layer_12/attn_mlp_ratio": 1.0071353853203373, "grad/layer_16/attn": 0.004472578875720501, "grad/layer_16/mlp": 0.004746445454657078, "grad/layer_16/attn_mlp_ratio": 0.9423006804180306, "grad/layer_20/attn": 0.0032512634061276913, "grad/layer_20/mlp": 0.00750852981582284, "grad/layer_20/attn_mlp_ratio": 0.4330093164144228, "grad/layer_24/attn": 0.024019407108426094, "grad/layer_24/mlp": 0.018217366188764572, "grad/layer_24/attn_mlp_ratio": 1.3184895515462016, "grad/layer_27/attn": 0.020176338031888008, "grad/layer_27/mlp": 0.019155465066432953, "grad/layer_27/attn_mlp_ratio": 1.0532940785611398} {"step": 14100, "timestamp": 1778340954.2837167, "eos/sharpness": 73.15237522125243, "eos/L0_probe": 2.304736614227295, "eos/L_plus": 2.749927043914795, "eos/L_minus": 2.5910699367523193, "eos/grad_norm": 0.37815484404563904, "eos/embed_grad_frac": 0.015417326241731644, "eos/time_s": 0.5827441215515137} {"step": 14100, "timestamp": 1778340954.301174, "train/loss": 2.32833468914032, "train/z_loss": 0.001360963541083038, "train/perplexity": 10.260839809188834, "train/grad_norm": 0.37890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917028.171845069, "perf/iters_per_sec": 0.9141102656579346, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.093959927558899, "data/tokens_consumed": 29571940352, "data/tokens_consumed_B": 29.571940352, "train/loss_slope": 5.171433290561108e-06} {"step": 14100, "timestamp": 1778340955.6659894, "geo/rankme_last": 429.6822509765625, "geo/layer_0/stable_rank_q_proj": 20.766054153442383, "geo/layer_0/stable_rank_k_proj": 16.865154266357422, "geo/layer_0/stable_rank_o_proj": 43.89765548706055, "geo/layer_0/stable_rank_gate_proj": 124.05412292480469, "geo/layer_0/stable_rank_down_proj": 57.94615936279297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06411140412092209, "geo/layer_0/attn_entropy_mean": 6.236484527587891, "geo/layer_0/attn_entropy_std": 0.4584534168243408, "geo/layer_7/stable_rank_q_proj": 41.694305419921875, "geo/layer_7/stable_rank_k_proj": 38.62320327758789, "geo/layer_7/stable_rank_o_proj": 88.4199447631836, "geo/layer_7/stable_rank_gate_proj": 77.99869537353516, "geo/layer_7/stable_rank_down_proj": 144.3243865966797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38637110590934753, "geo/layer_7/attn_entropy_mean": 4.722169876098633, "geo/layer_7/attn_entropy_std": 0.777694821357727, "geo/layer_14/stable_rank_q_proj": 51.50453567504883, "geo/layer_14/stable_rank_k_proj": 43.834266662597656, "geo/layer_14/stable_rank_o_proj": 42.43434524536133, "geo/layer_14/stable_rank_gate_proj": 71.89449310302734, "geo/layer_14/stable_rank_down_proj": 126.63890838623047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3715629577636719, "geo/layer_14/attn_entropy_mean": 5.542549133300781, "geo/layer_14/attn_entropy_std": 0.4768632650375366, "geo/layer_21/stable_rank_q_proj": 38.216121673583984, "geo/layer_21/stable_rank_k_proj": 28.520341873168945, "geo/layer_21/stable_rank_o_proj": 64.96424865722656, "geo/layer_21/stable_rank_gate_proj": 59.748966217041016, "geo/layer_21/stable_rank_down_proj": 48.6911506652832, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13778918981552124, "geo/layer_21/attn_entropy_mean": 5.86799430847168, "geo/layer_21/attn_entropy_std": 0.3273310363292694, "geo/layer_27/stable_rank_q_proj": 45.08460998535156, "geo/layer_27/stable_rank_k_proj": 30.65176773071289, "geo/layer_27/stable_rank_o_proj": 106.67296600341797, "geo/layer_27/stable_rank_gate_proj": 69.45580291748047, "geo/layer_27/stable_rank_down_proj": 130.0902099609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09874588251113892, "geo/layer_27/attn_entropy_mean": 4.33060359954834, "geo/layer_27/attn_entropy_std": 0.6852748394012451, "attnres/final_alpha/block_0": 0.2610774636268616, "attnres/block_norm/0": 1.7832030057907104, "attnres/final_alpha/block_1": 0.003725726157426834, "attnres/block_norm/1": 50756.9921875, "attnres/final_alpha/block_2": 0.00811451580375433, "attnres/block_norm/2": 30007.7109375, "attnres/final_alpha/block_3": 0.010542627424001694, "attnres/block_norm/3": 73871.6640625, "attnres/final_alpha/block_4": 0.011758575215935707, "attnres/block_norm/4": 17628.916015625, "attnres/final_alpha/block_5": 0.6075131297111511, "attnres/block_norm/5": 7204.42431640625, "attnres/final_alpha/block_6": 0.09726797789335251, "attnres/block_norm/6": 49511.18359375, "geo/tier1_time_s": 1.3611443042755127, "geo/step": 14100.0, "geo/rankme_slope": 0.00011329131261879752} {"step": 14110, "timestamp": 1778340966.0227566, "train/loss": 2.3629762172698974, "train/z_loss": 0.001372890721540898, "train/perplexity": 10.622519378252113, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789688.0797938518, "perf/iters_per_sec": 0.8533897780389079, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1717974901199342, "data/tokens_consumed": 29592911872, "data/tokens_consumed_B": 29.592911872, "train/loss_slope": 7.5421002402593185e-06} {"step": 14120, "timestamp": 1778340976.380987, "train/loss": 2.351426362991333, "train/z_loss": 0.0013565464061684907, "train/perplexity": 10.500536624904694, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025994.6045468955, "perf/iters_per_sec": 0.9660695097670057, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351222038269043, "data/tokens_consumed": 29613883392, "data/tokens_consumed_B": 29.613883392, "train/loss_slope": 7.336756162302949e-06} {"step": 14130, "timestamp": 1778340986.7361922, "train/loss": 2.3030585289001464, "train/z_loss": 0.0013722278294153512, "train/perplexity": 10.004735479945674, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026651.1069480772, "perf/iters_per_sec": 0.9663825545063387, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347868919372558, "data/tokens_consumed": 29634854912, "data/tokens_consumed_B": 29.634854912, "train/loss_slope": 4.64205275488945e-06} {"step": 14140, "timestamp": 1778340997.0888195, "train/loss": 2.3322762489318847, "train/z_loss": 0.001364694128278643, "train/perplexity": 10.301363333290944, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026726.1015109918, "perf/iters_per_sec": 0.9664183147005996, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347486019134522, "data/tokens_consumed": 29655826432, "data/tokens_consumed_B": 29.655826432, "train/loss_slope": 5.783611584310584e-06} {"step": 14150, "timestamp": 1778341007.4390025, "grad/layer_0/attn": 0.0028174580074846745, "grad/layer_0/mlp": 0.003266303101554513, "grad/layer_0/attn_mlp_ratio": 0.8625831203128288, "grad/layer_4/attn": 0.0017035442870110273, "grad/layer_4/mlp": 0.0025872462429106236, "grad/layer_4/attn_mlp_ratio": 0.6584391516018373, "grad/layer_8/attn": 0.0072358171455562115, "grad/layer_8/mlp": 0.0032168535981327295, "grad/layer_8/attn_mlp_ratio": 2.249346045720497, "grad/layer_12/attn": 0.006805691868066788, "grad/layer_12/mlp": 0.007019070442765951, "grad/layer_12/attn_mlp_ratio": 0.9696001524134734, "grad/layer_16/attn": 0.004126155283302069, "grad/layer_16/mlp": 0.004086448345333338, "grad/layer_16/attn_mlp_ratio": 1.0097167108553824, "grad/layer_20/attn": 0.002433540066704154, "grad/layer_20/mlp": 0.005140247754752636, "grad/layer_20/attn_mlp_ratio": 0.47342854575662546, "grad/layer_24/attn": 0.00671561760827899, "grad/layer_24/mlp": 0.009084228426218033, "grad/layer_24/attn_mlp_ratio": 0.7392611919544974, "grad/layer_27/attn": 0.0043800813145935535, "grad/layer_27/mlp": 0.008370282128453255, "grad/layer_27/attn_mlp_ratio": 0.5232895612174536} {"step": 14150, "timestamp": 1778341007.4530299, "train/loss": 2.3795161485671996, "train/z_loss": 0.001368339464534074, "train/perplexity": 10.799676160775151, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024754.927417355, "perf/iters_per_sec": 0.9654783856474661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357559680938722, "data/tokens_consumed": 29676797952, "data/tokens_consumed_B": 29.676797952, "train/loss_slope": 1.067807566393539e-05} {"step": 14160, "timestamp": 1778341017.8072104, "train/loss": 2.3364415884017946, "train/z_loss": 0.0013712588348425924, "train/perplexity": 10.344361497383648, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026796.2446618585, "perf/iters_per_sec": 0.9664517615613263, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034712791442871, "data/tokens_consumed": 29697769472, "data/tokens_consumed_B": 29.697769472, "train/loss_slope": 9.369249069186397e-06} {"step": 14170, "timestamp": 1778341028.158705, "train/loss": 2.358825922012329, "train/z_loss": 0.001359682739712298, "train/perplexity": 10.578524146205876, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026868.4943809952, "perf/iters_per_sec": 0.9664862129120804, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034675908088684, "data/tokens_consumed": 29718740992, "data/tokens_consumed_B": 29.718740992, "train/loss_slope": 1.195088527312051e-05} {"step": 14175, "timestamp": 1778341033.9275272, "eos/sharpness": 53.97329330444335, "eos/L0_probe": 2.305224657058716, "eos/L_plus": 2.5542359352111816, "eos/L_minus": 2.5959463119506836, "eos/grad_norm": 0.15306448936462402, "eos/embed_grad_frac": 0.09493447095155716, "eos/time_s": 0.5905253887176514} {"step": 14175, "timestamp": 1778341035.306512, "geo/rankme_last": 430.9310302734375, "geo/layer_0/stable_rank_q_proj": 20.763526916503906, "geo/layer_0/stable_rank_k_proj": 16.83176612854004, "geo/layer_0/stable_rank_o_proj": 43.79983901977539, "geo/layer_0/stable_rank_gate_proj": 124.28788757324219, "geo/layer_0/stable_rank_down_proj": 57.9205436706543, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06269478052854538, "geo/layer_0/attn_entropy_mean": 6.23032808303833, "geo/layer_0/attn_entropy_std": 0.4579162001609802, "geo/layer_7/stable_rank_q_proj": 41.690086364746094, "geo/layer_7/stable_rank_k_proj": 38.665958404541016, "geo/layer_7/stable_rank_o_proj": 88.40711975097656, "geo/layer_7/stable_rank_gate_proj": 77.99933624267578, "geo/layer_7/stable_rank_down_proj": 144.6381378173828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3979305922985077, "geo/layer_7/attn_entropy_mean": 4.723631858825684, "geo/layer_7/attn_entropy_std": 0.7711942195892334, "geo/layer_14/stable_rank_q_proj": 51.54023742675781, "geo/layer_14/stable_rank_k_proj": 43.844730377197266, "geo/layer_14/stable_rank_o_proj": 42.40425491333008, "geo/layer_14/stable_rank_gate_proj": 71.82880401611328, "geo/layer_14/stable_rank_down_proj": 126.83077239990234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37564852833747864, "geo/layer_14/attn_entropy_mean": 5.496610641479492, "geo/layer_14/attn_entropy_std": 0.4870702922344208, "geo/layer_21/stable_rank_q_proj": 38.23847198486328, "geo/layer_21/stable_rank_k_proj": 28.494279861450195, "geo/layer_21/stable_rank_o_proj": 64.91160583496094, "geo/layer_21/stable_rank_gate_proj": 59.736106872558594, "geo/layer_21/stable_rank_down_proj": 48.691017150878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13803942501544952, "geo/layer_21/attn_entropy_mean": 5.84458065032959, "geo/layer_21/attn_entropy_std": 0.32363590598106384, "geo/layer_27/stable_rank_q_proj": 45.095211029052734, "geo/layer_27/stable_rank_k_proj": 30.614730834960938, "geo/layer_27/stable_rank_o_proj": 106.68507385253906, "geo/layer_27/stable_rank_gate_proj": 69.4454574584961, "geo/layer_27/stable_rank_down_proj": 130.1409912109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10114254802465439, "geo/layer_27/attn_entropy_mean": 4.327868461608887, "geo/layer_27/attn_entropy_std": 0.695757269859314, "attnres/final_alpha/block_0": 0.2638302445411682, "attnres/block_norm/0": 1.783268928527832, "attnres/final_alpha/block_1": 0.003797534853219986, "attnres/block_norm/1": 50756.53125, "attnres/final_alpha/block_2": 0.00815727561712265, "attnres/block_norm/2": 30056.90234375, "attnres/final_alpha/block_3": 0.010502122342586517, "attnres/block_norm/3": 72952.40625, "attnres/final_alpha/block_4": 0.011931437067687511, "attnres/block_norm/4": 17730.73046875, "attnres/final_alpha/block_5": 0.6026514768600464, "attnres/block_norm/5": 7215.56884765625, "attnres/final_alpha/block_6": 0.09912990033626556, "attnres/block_norm/6": 49447.1953125, "geo/tier1_time_s": 1.3592312335968018, "geo/step": 14175.0, "geo/rankme_slope": 0.00015831639296343537} {"step": 14180, "timestamp": 1778341040.4810164, "train/loss": 2.311254954338074, "train/z_loss": 0.0013722778880037368, "train/perplexity": 10.087075534415654, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702806.6960264142, "perf/iters_per_sec": 0.8119615059024878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.231585478782654, "data/tokens_consumed": 29739712512, "data/tokens_consumed_B": 29.739712512, "train/loss_slope": 1.1969613499588477e-05} {"step": 14190, "timestamp": 1778341050.8352785, "train/loss": 2.3102526664733887, "train/z_loss": 0.001363153092097491, "train/perplexity": 10.076970445967056, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026522.5644215287, "perf/iters_per_sec": 0.9663212606532711, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348525285720824, "data/tokens_consumed": 29760684032, "data/tokens_consumed_B": 29.760684032, "train/loss_slope": 8.007485867261945e-06} {"step": 14200, "timestamp": 1778341061.1697102, "grad/layer_0/attn": 0.002889792202040553, "grad/layer_0/mlp": 0.0031974432058632374, "grad/layer_0/attn_mlp_ratio": 0.9037821551805043, "grad/layer_4/attn": 0.0019202560652047396, "grad/layer_4/mlp": 0.002532901242375374, "grad/layer_4/attn_mlp_ratio": 0.7581250927854573, "grad/layer_8/attn": 0.003544202074408531, "grad/layer_8/mlp": 0.0033847771119326353, "grad/layer_8/attn_mlp_ratio": 1.0471005482765194, "grad/layer_12/attn": 0.006573611870408058, "grad/layer_12/mlp": 0.006456475239247084, "grad/layer_12/attn_mlp_ratio": 1.0181424887428803, "grad/layer_16/attn": 0.0036367245484143496, "grad/layer_16/mlp": 0.004280758555978537, "grad/layer_16/attn_mlp_ratio": 0.8495514091492348, "grad/layer_20/attn": 0.002781208837404847, "grad/layer_20/mlp": 0.0054926881566643715, "grad/layer_20/attn_mlp_ratio": 0.5063474763983482, "grad/layer_24/attn": 0.009650826454162598, "grad/layer_24/mlp": 0.008700216189026833, "grad/layer_24/attn_mlp_ratio": 1.1092628198605505, "grad/layer_27/attn": 0.007905244827270508, "grad/layer_27/mlp": 0.008059530518949032, "grad/layer_27/attn_mlp_ratio": 0.980856727398519} {"step": 14200, "timestamp": 1778341061.1836681, "train/loss": 2.285782051086426, "train/z_loss": 0.001368385716341436, "train/perplexity": 9.83337341810119, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027854.0238593544, "perf/iters_per_sec": 0.9669561499878666, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034173059463501, "data/tokens_consumed": 29781655552, "data/tokens_consumed_B": 29.781655552, "train/loss_slope": 6.16275784206075e-06} {"step": 14210, "timestamp": 1778341071.54666, "train/loss": 2.3386759996414184, "train/z_loss": 0.0013710333267226815, "train/perplexity": 10.367500896820088, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025147.8576466884, "perf/iters_per_sec": 0.9656657493813936, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355550050735474, "data/tokens_consumed": 29802627072, "data/tokens_consumed_B": 29.802627072, "train/loss_slope": 4.8082712352579725e-06} {"step": 14220, "timestamp": 1778341081.8984094, "train/loss": 2.313461756706238, "train/z_loss": 0.0013674711692146957, "train/perplexity": 10.109360296581958, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027224.5874685382, "perf/iters_per_sec": 0.9666560113280002, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034494161605835, "data/tokens_consumed": 29823598592, "data/tokens_consumed_B": 29.823598592, "train/loss_slope": 2.62797991148602e-06} {"step": 14230, "timestamp": 1778341092.2487373, "train/loss": 2.3143749237060547, "train/z_loss": 0.0013609311077743768, "train/perplexity": 10.118596047043537, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027567.0649343154, "perf/iters_per_sec": 0.96681931730953, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343194246292113, "data/tokens_consumed": 29844570112, "data/tokens_consumed_B": 29.844570112, "train/loss_slope": 1.1270556405539811e-06} {"step": 14240, "timestamp": 1778341102.6144037, "train/loss": 2.3485347509384153, "train/z_loss": 0.001364823617041111, "train/perplexity": 10.470217004054815, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024749.0082916003, "perf/iters_per_sec": 0.9654755631883623, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357589960098266, "data/tokens_consumed": 29865541632, "data/tokens_consumed_B": 29.865541632, "train/loss_slope": -9.033505422304451e-07} {"step": 14250, "timestamp": 1778341112.9542542, "grad/layer_0/attn": 0.002933194860816002, "grad/layer_0/mlp": 0.003372756764292717, "grad/layer_0/attn_mlp_ratio": 0.8696727866362542, "grad/layer_4/attn": 0.0026542816776782274, "grad/layer_4/mlp": 0.0026923788245767355, "grad/layer_4/attn_mlp_ratio": 0.985849968386416, "grad/layer_8/attn": 0.0033621874172240496, "grad/layer_8/mlp": 0.0032975648064166307, "grad/layer_8/attn_mlp_ratio": 1.0195970398282903, "grad/layer_12/attn": 0.012485401704907417, "grad/layer_12/mlp": 0.007325019687414169, "grad/layer_12/attn_mlp_ratio": 1.7044870959065264, "grad/layer_16/attn": 0.004368255380541086, "grad/layer_16/mlp": 0.004428818356245756, "grad/layer_16/attn_mlp_ratio": 0.9863252295611123, "grad/layer_20/attn": 0.0034305015578866005, "grad/layer_20/mlp": 0.006326119881123304, "grad/layer_20/attn_mlp_ratio": 0.5422757658917275, "grad/layer_24/attn": 0.011355261318385601, "grad/layer_24/mlp": 0.011870547197759151, "grad/layer_24/attn_mlp_ratio": 0.9565912197265898, "grad/layer_27/attn": 0.006259751971811056, "grad/layer_27/mlp": 0.012354467995464802, "grad/layer_27/attn_mlp_ratio": 0.5066791968250699} {"step": 14250, "timestamp": 1778341113.5388286, "eos/sharpness": 60.101294517517076, "eos/L0_probe": 2.3067448139190674, "eos/L_plus": 2.593210220336914, "eos/L_minus": 2.6212923526763916, "eos/grad_norm": 0.19208186864852905, "eos/embed_grad_frac": 0.06938858330249786, "eos/time_s": 0.5817909240722656} {"step": 14250, "timestamp": 1778341113.5580664, "train/loss": 2.3343019247055055, "train/z_loss": 0.0013627370353788137, "train/perplexity": 10.322251704821825, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917499.73196663, "perf/iters_per_sec": 0.9143351230462218, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0936908960342406, "data/tokens_consumed": 29886513152, "data/tokens_consumed_B": 29.886513152, "train/loss_slope": 2.77269147660596e-07} {"step": 14250, "timestamp": 1778341114.9168591, "geo/rankme_last": 430.5356140136719, "geo/layer_0/stable_rank_q_proj": 20.718402862548828, "geo/layer_0/stable_rank_k_proj": 16.80568504333496, "geo/layer_0/stable_rank_o_proj": 43.72846984863281, "geo/layer_0/stable_rank_gate_proj": 124.23861694335938, "geo/layer_0/stable_rank_down_proj": 57.863121032714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06111550331115723, "geo/layer_0/attn_entropy_mean": 6.230045795440674, "geo/layer_0/attn_entropy_std": 0.4577256441116333, "geo/layer_7/stable_rank_q_proj": 41.651371002197266, "geo/layer_7/stable_rank_k_proj": 38.70524978637695, "geo/layer_7/stable_rank_o_proj": 88.45015716552734, "geo/layer_7/stable_rank_gate_proj": 77.9194564819336, "geo/layer_7/stable_rank_down_proj": 144.453857421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3899479806423187, "geo/layer_7/attn_entropy_mean": 4.721813201904297, "geo/layer_7/attn_entropy_std": 0.7754963040351868, "geo/layer_14/stable_rank_q_proj": 51.54754638671875, "geo/layer_14/stable_rank_k_proj": 43.786293029785156, "geo/layer_14/stable_rank_o_proj": 42.36540603637695, "geo/layer_14/stable_rank_gate_proj": 71.83768463134766, "geo/layer_14/stable_rank_down_proj": 127.19891357421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37696585059165955, "geo/layer_14/attn_entropy_mean": 5.5335283279418945, "geo/layer_14/attn_entropy_std": 0.4807533025741577, "geo/layer_21/stable_rank_q_proj": 38.251953125, "geo/layer_21/stable_rank_k_proj": 28.444913864135742, "geo/layer_21/stable_rank_o_proj": 64.83631134033203, "geo/layer_21/stable_rank_gate_proj": 59.75849914550781, "geo/layer_21/stable_rank_down_proj": 48.67308044433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13434195518493652, "geo/layer_21/attn_entropy_mean": 5.859712600708008, "geo/layer_21/attn_entropy_std": 0.32441624999046326, "geo/layer_27/stable_rank_q_proj": 45.09310531616211, "geo/layer_27/stable_rank_k_proj": 30.677570343017578, "geo/layer_27/stable_rank_o_proj": 106.7532958984375, "geo/layer_27/stable_rank_gate_proj": 69.41030883789062, "geo/layer_27/stable_rank_down_proj": 130.4546356201172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09966985881328583, "geo/layer_27/attn_entropy_mean": 4.304835319519043, "geo/layer_27/attn_entropy_std": 0.7054723501205444, "attnres/final_alpha/block_0": 0.2641240358352661, "attnres/block_norm/0": 1.783282995223999, "attnres/final_alpha/block_1": 0.0038331449031829834, "attnres/block_norm/1": 50826.72265625, "attnres/final_alpha/block_2": 0.008104650303721428, "attnres/block_norm/2": 30099.4375, "attnres/final_alpha/block_3": 0.010362980887293816, "attnres/block_norm/3": 73621.046875, "attnres/final_alpha/block_4": 0.011830674484372139, "attnres/block_norm/4": 17674.6484375, "attnres/final_alpha/block_5": 0.6022017002105713, "attnres/block_norm/5": 7226.59619140625, "attnres/final_alpha/block_6": 0.09954286366701126, "attnres/block_norm/6": 49199.6796875, "geo/tier1_time_s": 1.3554487228393555, "geo/step": 14250.0, "geo/rankme_slope": 0.00017789264143157262} {"step": 14260, "timestamp": 1778341125.285374, "train/loss": 2.2896914005279543, "train/z_loss": 0.001378014893271029, "train/perplexity": 9.871890750783024, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788749.1354690236, "perf/iters_per_sec": 0.8529420544953459, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1724125862121582, "data/tokens_consumed": 29907484672, "data/tokens_consumed_B": 29.907484672, "train/loss_slope": -1.0591977929482597e-06} {"step": 14270, "timestamp": 1778341135.6285415, "train/loss": 2.3172899007797243, "train/z_loss": 0.0013776606298051774, "train/perplexity": 10.148134553657231, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028827.73704973, "perf/iters_per_sec": 0.9674204526184702, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336767196655274, "data/tokens_consumed": 29928456192, "data/tokens_consumed_B": 29.928456192, "train/loss_slope": -1.940985052141388e-06} {"step": 14280, "timestamp": 1778341145.9679818, "train/loss": 2.3791109323501587, "train/z_loss": 0.0013643412617966532, "train/perplexity": 10.795300843390663, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029220.4708825976, "perf/iters_per_sec": 0.967607722703265, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0334766626358032, "data/tokens_consumed": 29949427712, "data/tokens_consumed_B": 29.949427712, "train/loss_slope": -1.2513468582423485e-06} {"step": 14290, "timestamp": 1778341156.3135755, "train/loss": 2.340166711807251, "train/z_loss": 0.0013695144676603376, "train/perplexity": 10.382967381710689, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028545.742083007, "perf/iters_per_sec": 0.9672859869399104, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0338204145431518, "data/tokens_consumed": 29970399232, "data/tokens_consumed_B": 29.970399232, "train/loss_slope": -1.111514611963822e-06} {"step": 14300, "timestamp": 1778341166.6622646, "grad/layer_0/attn": 0.0032929000444710255, "grad/layer_0/mlp": 0.003678346984088421, "grad/layer_0/attn_mlp_ratio": 0.8952118897956259, "grad/layer_4/attn": 0.002810745732858777, "grad/layer_4/mlp": 0.002751863794401288, "grad/layer_4/attn_mlp_ratio": 1.0213970751159913, "grad/layer_8/attn": 0.004653111565858126, "grad/layer_8/mlp": 0.0036958695854991674, "grad/layer_8/attn_mlp_ratio": 1.2590031472469712, "grad/layer_12/attn": 0.010724305175244808, "grad/layer_12/mlp": 0.008068373426795006, "grad/layer_12/attn_mlp_ratio": 1.3291780728333256, "grad/layer_16/attn": 0.00391271710395813, "grad/layer_16/mlp": 0.00507911155000329, "grad/layer_16/attn_mlp_ratio": 0.77035461584224, "grad/layer_20/attn": 0.0035657070111483335, "grad/layer_20/mlp": 0.007421530317515135, "grad/layer_20/attn_mlp_ratio": 0.48045440907081777, "grad/layer_24/attn": 0.016193902119994164, "grad/layer_24/mlp": 0.01431929413229227, "grad/layer_24/attn_mlp_ratio": 1.1309148242428286, "grad/layer_27/attn": 0.007505306974053383, "grad/layer_27/mlp": 0.013372362591326237, "grad/layer_27/attn_mlp_ratio": 0.5612551160403073} {"step": 14300, "timestamp": 1778341166.676397, "train/loss": 2.3192046642303468, "train/z_loss": 0.0013710150262340904, "train/perplexity": 10.167584445821431, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025127.2493779005, "perf/iters_per_sec": 0.9656559225930693, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355655431747437, "data/tokens_consumed": 29991370752, "data/tokens_consumed_B": 29.991370752, "train/loss_slope": -1.2327798093148402e-06} {"step": 14310, "timestamp": 1778341177.041464, "train/loss": 2.280599045753479, "train/z_loss": 0.0013776798848994076, "train/perplexity": 9.782538842970997, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024304.2873229745, "perf/iters_per_sec": 0.9652635037054894, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359865427017212, "data/tokens_consumed": 30012342272, "data/tokens_consumed_B": 30.012342272, "train/loss_slope": -2.9922329267152098e-06} {"step": 14320, "timestamp": 1778341187.3852878, "train/loss": 2.3591343641281126, "train/z_loss": 0.0013561421423219145, "train/perplexity": 10.581787511829225, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028866.9053184928, "perf/iters_per_sec": 0.9674391295044388, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336567640304566, "data/tokens_consumed": 30033313792, "data/tokens_consumed_B": 30.033313792, "train/loss_slope": -2.1436401004373484e-06} {"step": 14325, "timestamp": 1778341193.1381915, "eos/sharpness": 29.640793800353997, "eos/L0_probe": 2.3066813945770264, "eos/L_plus": 2.45516300201416, "eos/L_minus": 2.4546077251434326, "eos/grad_norm": 0.12139920145273209, "eos/embed_grad_frac": 0.15979284048080444, "eos/time_s": 0.5913968086242676} {"step": 14325, "timestamp": 1778341194.518364, "geo/rankme_last": 430.1040954589844, "geo/layer_0/stable_rank_q_proj": 20.687204360961914, "geo/layer_0/stable_rank_k_proj": 16.83740997314453, "geo/layer_0/stable_rank_o_proj": 43.69115447998047, "geo/layer_0/stable_rank_gate_proj": 124.21697235107422, "geo/layer_0/stable_rank_down_proj": 57.87178039550781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06707393378019333, "geo/layer_0/attn_entropy_mean": 6.231301307678223, "geo/layer_0/attn_entropy_std": 0.4596919119358063, "geo/layer_7/stable_rank_q_proj": 41.685916900634766, "geo/layer_7/stable_rank_k_proj": 38.6430778503418, "geo/layer_7/stable_rank_o_proj": 88.42613983154297, "geo/layer_7/stable_rank_gate_proj": 77.9023208618164, "geo/layer_7/stable_rank_down_proj": 144.7901153564453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39031779766082764, "geo/layer_7/attn_entropy_mean": 4.758990287780762, "geo/layer_7/attn_entropy_std": 0.7686439156532288, "geo/layer_14/stable_rank_q_proj": 51.61800765991211, "geo/layer_14/stable_rank_k_proj": 43.91997528076172, "geo/layer_14/stable_rank_o_proj": 42.384788513183594, "geo/layer_14/stable_rank_gate_proj": 71.85738372802734, "geo/layer_14/stable_rank_down_proj": 127.54314422607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3698342442512512, "geo/layer_14/attn_entropy_mean": 5.526197910308838, "geo/layer_14/attn_entropy_std": 0.4589100778102875, "geo/layer_21/stable_rank_q_proj": 38.16059875488281, "geo/layer_21/stable_rank_k_proj": 28.486324310302734, "geo/layer_21/stable_rank_o_proj": 64.8363265991211, "geo/layer_21/stable_rank_gate_proj": 59.73685073852539, "geo/layer_21/stable_rank_down_proj": 48.69932174682617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1377103626728058, "geo/layer_21/attn_entropy_mean": 5.841914176940918, "geo/layer_21/attn_entropy_std": 0.32039791345596313, "geo/layer_27/stable_rank_q_proj": 45.13986587524414, "geo/layer_27/stable_rank_k_proj": 30.645435333251953, "geo/layer_27/stable_rank_o_proj": 106.78071594238281, "geo/layer_27/stable_rank_gate_proj": 69.30792999267578, "geo/layer_27/stable_rank_down_proj": 130.3067626953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1044003963470459, "geo/layer_27/attn_entropy_mean": 4.301375389099121, "geo/layer_27/attn_entropy_std": 0.6698052287101746, "attnres/final_alpha/block_0": 0.26312074065208435, "attnres/block_norm/0": 1.7831679582595825, "attnres/final_alpha/block_1": 0.0037972470745444298, "attnres/block_norm/1": 50795.12890625, "attnres/final_alpha/block_2": 0.008035881444811821, "attnres/block_norm/2": 30057.4609375, "attnres/final_alpha/block_3": 0.010433057323098183, "attnres/block_norm/3": 73785.71875, "attnres/final_alpha/block_4": 0.011757556349039078, "attnres/block_norm/4": 17720.3671875, "attnres/final_alpha/block_5": 0.6036167144775391, "attnres/block_norm/5": 7272.61376953125, "attnres/final_alpha/block_6": 0.0992387980222702, "attnres/block_norm/6": 49489.2734375, "geo/tier1_time_s": 1.3621761798858643, "geo/step": 14325.0, "geo/rankme_slope": 0.0001713226306147459} {"step": 14330, "timestamp": 1778341199.6926272, "train/loss": 2.3285434007644654, "train/z_loss": 0.0013648487394675613, "train/perplexity": 10.262981589229929, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704889.6695900226, "perf/iters_per_sec": 0.8129547450971711, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.230080771446228, "data/tokens_consumed": 30054285312, "data/tokens_consumed_B": 30.054285312, "train/loss_slope": -3.065229857107407e-06} {"step": 14340, "timestamp": 1778341210.0499341, "train/loss": 2.3622271060943603, "train/z_loss": 0.0013595271506346763, "train/perplexity": 10.614564910036005, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025521.9099944585, "perf/iters_per_sec": 0.9658441114399235, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03536376953125, "data/tokens_consumed": 30075256832, "data/tokens_consumed_B": 30.075256832, "train/loss_slope": -6.537377351473847e-08} {"step": 14350, "timestamp": 1778341220.3928618, "grad/layer_0/attn": 0.0029822071082890034, "grad/layer_0/mlp": 0.0034078555181622505, "grad/layer_0/attn_mlp_ratio": 0.87509784522421, "grad/layer_4/attn": 0.0021978935692459345, "grad/layer_4/mlp": 0.0026651041116565466, "grad/layer_4/attn_mlp_ratio": 0.8246932932801871, "grad/layer_8/attn": 0.004471235908567905, "grad/layer_8/mlp": 0.003353426232933998, "grad/layer_8/attn_mlp_ratio": 1.3333335712956924, "grad/layer_12/attn": 0.007094794884324074, "grad/layer_12/mlp": 0.007248049136251211, "grad/layer_12/attn_mlp_ratio": 0.9788557794060457, "grad/layer_16/attn": 0.004109254106879234, "grad/layer_16/mlp": 0.0048384820111095905, "grad/layer_16/attn_mlp_ratio": 0.8492857909806089, "grad/layer_20/attn": 0.005845557898283005, "grad/layer_20/mlp": 0.006740067154169083, "grad/layer_20/attn_mlp_ratio": 0.8672848026356448, "grad/layer_24/attn": 0.012208227068185806, "grad/layer_24/mlp": 0.014069214463233948, "grad/layer_24/attn_mlp_ratio": 0.8677262695309712, "grad/layer_27/attn": 0.010012920014560223, "grad/layer_27/mlp": 0.014364291913807392, "grad/layer_27/attn_mlp_ratio": 0.6970702074933802} {"step": 14350, "timestamp": 1778341220.407068, "train/loss": 2.3126813411712646, "train/z_loss": 0.0013627514825202524, "train/perplexity": 10.101473872502064, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026059.56363432, "perf/iters_per_sec": 0.9661004846736526, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350890159606934, "data/tokens_consumed": 30096228352, "data/tokens_consumed_B": 30.096228352, "train/loss_slope": 1.6666315641269565e-07} {"step": 14360, "timestamp": 1778341230.7632556, "train/loss": 2.3401083946228027, "train/z_loss": 0.001366443419829011, "train/perplexity": 10.382361893942113, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026280.3249957096, "perf/iters_per_sec": 0.9662057518938587, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349762439727783, "data/tokens_consumed": 30117199872, "data/tokens_consumed_B": 30.117199872, "train/loss_slope": -2.0779187923217163e-06} {"step": 14370, "timestamp": 1778341241.1228983, "train/loss": 2.350447750091553, "train/z_loss": 0.0013698402093723417, "train/perplexity": 10.490265690762962, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025221.715159078, "perf/iters_per_sec": 0.9657009673877134, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355172395706176, "data/tokens_consumed": 30138171392, "data/tokens_consumed_B": 30.138171392, "train/loss_slope": 1.365562076150883e-06} {"step": 14380, "timestamp": 1778341251.4689429, "train/loss": 2.3052705049514772, "train/z_loss": 0.0013680020696483553, "train/perplexity": 10.026890209059035, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028145.0389224032, "perf/iters_per_sec": 0.9670949167835251, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340246677398681, "data/tokens_consumed": 30159142912, "data/tokens_consumed_B": 30.159142912, "train/loss_slope": 1.0981482211942262e-06} {"step": 14390, "timestamp": 1778341261.813137, "train/loss": 2.3048120021820067, "train/z_loss": 0.0013566314009949565, "train/perplexity": 10.022293905918385, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028728.115138467, "perf/iters_per_sec": 0.9673729491894087, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.033727478981018, "data/tokens_consumed": 30180114432, "data/tokens_consumed_B": 30.180114432, "train/loss_slope": -1.3497675355761682e-06} {"step": 14400, "timestamp": 1778341272.1508358, "grad/layer_0/attn": 0.002967948094010353, "grad/layer_0/mlp": 0.003408895106986165, "grad/layer_0/attn_mlp_ratio": 0.8706480879576063, "grad/layer_4/attn": 0.003015493042767048, "grad/layer_4/mlp": 0.002626455156132579, "grad/layer_4/attn_mlp_ratio": 1.1481227543191892, "grad/layer_8/attn": 0.0037892593536525965, "grad/layer_8/mlp": 0.0033792501781135798, "grad/layer_8/attn_mlp_ratio": 1.1213313728771515, "grad/layer_12/attn": 0.004385187290608883, "grad/layer_12/mlp": 0.006715394556522369, "grad/layer_12/attn_mlp_ratio": 0.6530051493473644, "grad/layer_16/attn": 0.003909474238753319, "grad/layer_16/mlp": 0.0046723149716854095, "grad/layer_16/attn_mlp_ratio": 0.8367317226625052, "grad/layer_20/attn": 0.0027757473289966583, "grad/layer_20/mlp": 0.005420743953436613, "grad/layer_20/attn_mlp_ratio": 0.5120602082728672, "grad/layer_24/attn": 0.0056498050689697266, "grad/layer_24/mlp": 0.00856123585253954, "grad/layer_24/attn_mlp_ratio": 0.6599286715481556, "grad/layer_27/attn": 0.0039165993221104145, "grad/layer_27/mlp": 0.007279953919351101, "grad/layer_27/attn_mlp_ratio": 0.5379978103844564} {"step": 14400, "timestamp": 1778341272.7308104, "eos/sharpness": 20.785498619079586, "eos/L0_probe": 2.3072702884674072, "eos/L_plus": 2.410630226135254, "eos/L_minus": 2.4117653369903564, "eos/grad_norm": 0.09829207509756088, "eos/embed_grad_frac": 0.25582727789878845, "eos/time_s": 0.577256441116333} {"step": 14400, "timestamp": 1778341272.749059, "train/loss": 2.271697163581848, "train/z_loss": 0.0013833988341502845, "train/perplexity": 9.695842288618058, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918653.5326393957, "perf/iters_per_sec": 0.9148852980801562, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0930331945419312, "data/tokens_consumed": 30201085952, "data/tokens_consumed_B": 30.201085952, "train/loss_slope": -5.797341003669736e-06} {"step": 14400, "timestamp": 1778341274.1095657, "geo/rankme_last": 428.733642578125, "geo/layer_0/stable_rank_q_proj": 20.667354583740234, "geo/layer_0/stable_rank_k_proj": 16.82845687866211, "geo/layer_0/stable_rank_o_proj": 43.7867431640625, "geo/layer_0/stable_rank_gate_proj": 124.25193786621094, "geo/layer_0/stable_rank_down_proj": 57.87366485595703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.064764104783535, "geo/layer_0/attn_entropy_mean": 6.234481334686279, "geo/layer_0/attn_entropy_std": 0.4603518843650818, "geo/layer_7/stable_rank_q_proj": 41.710330963134766, "geo/layer_7/stable_rank_k_proj": 38.635677337646484, "geo/layer_7/stable_rank_o_proj": 88.47528076171875, "geo/layer_7/stable_rank_gate_proj": 77.91036987304688, "geo/layer_7/stable_rank_down_proj": 144.87319946289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39771145582199097, "geo/layer_7/attn_entropy_mean": 4.760551452636719, "geo/layer_7/attn_entropy_std": 0.7823507785797119, "geo/layer_14/stable_rank_q_proj": 51.65561294555664, "geo/layer_14/stable_rank_k_proj": 43.96879959106445, "geo/layer_14/stable_rank_o_proj": 42.37305450439453, "geo/layer_14/stable_rank_gate_proj": 71.93254852294922, "geo/layer_14/stable_rank_down_proj": 127.59414672851562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3672279417514801, "geo/layer_14/attn_entropy_mean": 5.545959949493408, "geo/layer_14/attn_entropy_std": 0.48162832856178284, "geo/layer_21/stable_rank_q_proj": 38.188236236572266, "geo/layer_21/stable_rank_k_proj": 28.576120376586914, "geo/layer_21/stable_rank_o_proj": 64.81597900390625, "geo/layer_21/stable_rank_gate_proj": 59.751914978027344, "geo/layer_21/stable_rank_down_proj": 48.72105026245117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13629348576068878, "geo/layer_21/attn_entropy_mean": 5.868520736694336, "geo/layer_21/attn_entropy_std": 0.33060717582702637, "geo/layer_27/stable_rank_q_proj": 45.254493713378906, "geo/layer_27/stable_rank_k_proj": 30.61729621887207, "geo/layer_27/stable_rank_o_proj": 106.87461853027344, "geo/layer_27/stable_rank_gate_proj": 69.30657958984375, "geo/layer_27/stable_rank_down_proj": 130.36740112304688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10271038115024567, "geo/layer_27/attn_entropy_mean": 4.318534851074219, "geo/layer_27/attn_entropy_std": 0.6666654944419861, "attnres/final_alpha/block_0": 0.2643280625343323, "attnres/block_norm/0": 1.7832350730895996, "attnres/final_alpha/block_1": 0.003861234523355961, "attnres/block_norm/1": 50850.9765625, "attnres/final_alpha/block_2": 0.008218759670853615, "attnres/block_norm/2": 30031.998046875, "attnres/final_alpha/block_3": 0.010616286657750607, "attnres/block_norm/3": 73116.96875, "attnres/final_alpha/block_4": 0.011622752994298935, "attnres/block_norm/4": 17827.150390625, "attnres/final_alpha/block_5": 0.6008992195129395, "attnres/block_norm/5": 7367.2333984375, "attnres/final_alpha/block_6": 0.10045371949672699, "attnres/block_norm/6": 49656.76953125, "geo/tier1_time_s": 1.356839656829834, "geo/step": 14400.0, "geo/rankme_slope": 0.00013329151973289315} {"step": 14410, "timestamp": 1778341284.9938323, "train/loss": 2.3145847082138062, "train/z_loss": 0.0013737450004555284, "train/perplexity": 10.12071899440735, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1713169.8328177389, "perf/iters_per_sec": 0.8169030346001334, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2241354942321778, "data/tokens_consumed": 30222057472, "data/tokens_consumed_B": 30.222057472, "train/loss_slope": -5.969746921858607e-06} {"step": 14420, "timestamp": 1778341295.3348415, "train/loss": 2.3252289295196533, "train/z_loss": 0.0013721322990022599, "train/perplexity": 10.22902154275477, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029383.1599173471, "perf/iters_per_sec": 0.9676852988802658, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0333938121795654, "data/tokens_consumed": 30243028992, "data/tokens_consumed_B": 30.243028992, "train/loss_slope": -3.7510416080670443e-06} {"step": 14430, "timestamp": 1778341305.6915689, "train/loss": 2.379258394241333, "train/z_loss": 0.0013717678841203452, "train/perplexity": 10.796892856246554, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.0451827645, "perf/iters_per_sec": 0.9662304140008757, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034949827194214, "data/tokens_consumed": 30264000512, "data/tokens_consumed_B": 30.264000512, "train/loss_slope": -1.1453872895834336e-06} {"step": 14440, "timestamp": 1778341316.036465, "train/loss": 2.29852979183197, "train/z_loss": 0.0013691041967831552, "train/perplexity": 9.95952910467721, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028381.4098476092, "perf/iters_per_sec": 0.9672076272237822, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0339041709899903, "data/tokens_consumed": 30284972032, "data/tokens_consumed_B": 30.284972032, "train/loss_slope": -1.7007603289091763e-06} {"step": 14450, "timestamp": 1778341326.365276, "grad/layer_0/attn": 0.00283247884362936, "grad/layer_0/mlp": 0.003082340583205223, "grad/layer_0/attn_mlp_ratio": 0.9189376304386834, "grad/layer_4/attn": 0.0017814717721194029, "grad/layer_4/mlp": 0.0025507877580821514, "grad/layer_4/attn_mlp_ratio": 0.6984006006124043, "grad/layer_8/attn": 0.0028052986599504948, "grad/layer_8/mlp": 0.003497905796393752, "grad/layer_8/attn_mlp_ratio": 0.8019937479858115, "grad/layer_12/attn": 0.00524686137214303, "grad/layer_12/mlp": 0.006823072209954262, "grad/layer_12/attn_mlp_ratio": 0.7689880941886431, "grad/layer_16/attn": 0.00451207160949707, "grad/layer_16/mlp": 0.00445159338414669, "grad/layer_16/attn_mlp_ratio": 1.0135857251040012, "grad/layer_20/attn": 0.0029157153330743313, "grad/layer_20/mlp": 0.0056339106522500515, "grad/layer_20/attn_mlp_ratio": 0.5175295565180658, "grad/layer_24/attn": 0.005910541396588087, "grad/layer_24/mlp": 0.008653409779071808, "grad/layer_24/attn_mlp_ratio": 0.6830303289900409, "grad/layer_27/attn": 0.008602584712207317, "grad/layer_27/mlp": 0.007378540467470884, "grad/layer_27/attn_mlp_ratio": 1.1658924462830451} {"step": 14450, "timestamp": 1778341326.379498, "train/loss": 2.3593668699264527, "train/z_loss": 0.0013573237229138613, "train/perplexity": 10.584248124824837, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028643.8021235266, "perf/iters_per_sec": 0.967332745610965, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337704420089722, "data/tokens_consumed": 30305943552, "data/tokens_consumed_B": 30.305943552, "train/loss_slope": 1.3977200141583943e-06} {"step": 14460, "timestamp": 1778341336.7235456, "train/loss": 2.375177240371704, "train/z_loss": 0.0013612785027362407, "train/perplexity": 10.752918868526596, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028934.7163269713, "perf/iters_per_sec": 0.9674714643130166, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0336222171783447, "data/tokens_consumed": 30326915072, "data/tokens_consumed_B": 30.326915072, "train/loss_slope": 4.105942906683847e-06} {"step": 14470, "timestamp": 1778341347.072632, "train/loss": 2.3089656829833984, "train/z_loss": 0.0013776027713902294, "train/perplexity": 10.06400989317162, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027349.2005476367, "perf/iters_per_sec": 0.9667154314745124, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344305753707885, "data/tokens_consumed": 30347886592, "data/tokens_consumed_B": 30.347886592, "train/loss_slope": 1.4587645125825348e-06} {"step": 14475, "timestamp": 1778341352.8294067, "eos/sharpness": 9.81311798095703, "eos/L0_probe": 2.3034911155700684, "eos/L_plus": 2.3616886138916016, "eos/L_minus": 2.3434247970581055, "eos/grad_norm": 0.09368957579135895, "eos/embed_grad_frac": 0.2783430814743042, "eos/time_s": 0.5937683582305908} {"step": 14475, "timestamp": 1778341354.2047274, "geo/rankme_last": 429.66729736328125, "geo/layer_0/stable_rank_q_proj": 20.65503692626953, "geo/layer_0/stable_rank_k_proj": 16.800416946411133, "geo/layer_0/stable_rank_o_proj": 43.77595520019531, "geo/layer_0/stable_rank_gate_proj": 124.44059753417969, "geo/layer_0/stable_rank_down_proj": 57.95302963256836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0659145787358284, "geo/layer_0/attn_entropy_mean": 6.230251312255859, "geo/layer_0/attn_entropy_std": 0.45728790760040283, "geo/layer_7/stable_rank_q_proj": 41.77831268310547, "geo/layer_7/stable_rank_k_proj": 38.64750671386719, "geo/layer_7/stable_rank_o_proj": 88.35163116455078, "geo/layer_7/stable_rank_gate_proj": 77.82974243164062, "geo/layer_7/stable_rank_down_proj": 144.8430633544922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39493387937545776, "geo/layer_7/attn_entropy_mean": 4.766950607299805, "geo/layer_7/attn_entropy_std": 0.7721881866455078, "geo/layer_14/stable_rank_q_proj": 51.63619613647461, "geo/layer_14/stable_rank_k_proj": 43.9207763671875, "geo/layer_14/stable_rank_o_proj": 42.34284973144531, "geo/layer_14/stable_rank_gate_proj": 71.92192840576172, "geo/layer_14/stable_rank_down_proj": 127.39787292480469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3801511228084564, "geo/layer_14/attn_entropy_mean": 5.539875030517578, "geo/layer_14/attn_entropy_std": 0.4705805778503418, "geo/layer_21/stable_rank_q_proj": 38.21940231323242, "geo/layer_21/stable_rank_k_proj": 28.556407928466797, "geo/layer_21/stable_rank_o_proj": 64.80973815917969, "geo/layer_21/stable_rank_gate_proj": 59.78003692626953, "geo/layer_21/stable_rank_down_proj": 48.73615646362305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13606178760528564, "geo/layer_21/attn_entropy_mean": 5.850712299346924, "geo/layer_21/attn_entropy_std": 0.3408152759075165, "geo/layer_27/stable_rank_q_proj": 45.22466278076172, "geo/layer_27/stable_rank_k_proj": 30.606233596801758, "geo/layer_27/stable_rank_o_proj": 106.93109130859375, "geo/layer_27/stable_rank_gate_proj": 69.28424072265625, "geo/layer_27/stable_rank_down_proj": 130.15423583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09783431887626648, "geo/layer_27/attn_entropy_mean": 4.290433883666992, "geo/layer_27/attn_entropy_std": 0.7003340125083923, "attnres/final_alpha/block_0": 0.26316261291503906, "attnres/block_norm/0": 1.7831672430038452, "attnres/final_alpha/block_1": 0.0037964002694934607, "attnres/block_norm/1": 50722.80859375, "attnres/final_alpha/block_2": 0.00819362886250019, "attnres/block_norm/2": 30225.421875, "attnres/final_alpha/block_3": 0.010370399802923203, "attnres/block_norm/3": 73860.078125, "attnres/final_alpha/block_4": 0.01182185672223568, "attnres/block_norm/4": 17743.357421875, "attnres/final_alpha/block_5": 0.6043667793273926, "attnres/block_norm/5": 7207.6904296875, "attnres/final_alpha/block_6": 0.09828831255435944, "attnres/block_norm/6": 49521.82421875, "geo/tier1_time_s": 1.3556969165802002, "geo/step": 14475.0, "geo/rankme_slope": 0.00012091522155737294} {"step": 14480, "timestamp": 1778341359.3840759, "train/loss": 2.3080037593841554, "train/z_loss": 0.0013617199496366084, "train/perplexity": 10.054333739158919, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704398.1051711407, "perf/iters_per_sec": 0.8127203489165977, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2304355382919312, "data/tokens_consumed": 30368858112, "data/tokens_consumed_B": 30.368858112, "train/loss_slope": -5.846026420879337e-07} {"step": 14490, "timestamp": 1778341369.731551, "train/loss": 2.3157738208770753, "train/z_loss": 0.0013708960032090545, "train/perplexity": 10.132760827654272, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027709.108565708, "perf/iters_per_sec": 0.966887048991064, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342469692230225, "data/tokens_consumed": 30389829632, "data/tokens_consumed_B": 30.389829632, "train/loss_slope": -3.164755200037618e-07} {"step": 14500, "timestamp": 1778341380.0817502, "grad/layer_0/attn": 0.0031209494918584824, "grad/layer_0/mlp": 0.003362091025337577, "grad/layer_0/attn_mlp_ratio": 0.9282762945769704, "grad/layer_4/attn": 0.002022102940827608, "grad/layer_4/mlp": 0.0025981380604207516, "grad/layer_4/attn_mlp_ratio": 0.7782892271210634, "grad/layer_8/attn": 0.004198363516479731, "grad/layer_8/mlp": 0.0034968482796102762, "grad/layer_8/attn_mlp_ratio": 1.2006135413133463, "grad/layer_12/attn": 0.00598614476621151, "grad/layer_12/mlp": 0.006836378015577793, "grad/layer_12/attn_mlp_ratio": 0.8756310234758828, "grad/layer_16/attn": 0.004265377763658762, "grad/layer_16/mlp": 0.004408812616020441, "grad/layer_16/attn_mlp_ratio": 0.9674663085958551, "grad/layer_20/attn": 0.002571264747530222, "grad/layer_20/mlp": 0.005484164226800203, "grad/layer_20/attn_mlp_ratio": 0.46885260803817946, "grad/layer_24/attn": 0.009016248397529125, "grad/layer_24/mlp": 0.009668261744081974, "grad/layer_24/attn_mlp_ratio": 0.9325614617117604, "grad/layer_27/attn": 0.005486206151545048, "grad/layer_27/mlp": 0.009446720592677593, "grad/layer_27/attn_mlp_ratio": 0.5807524462745631} {"step": 14500, "timestamp": 1778341380.0960112, "train/loss": 2.3133724689483643, "train/z_loss": 0.0013639678596518933, "train/perplexity": 10.108457694763786, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024485.6189294124, "perf/iters_per_sec": 0.9653499693533957, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358937501907348, "data/tokens_consumed": 30410801152, "data/tokens_consumed_B": 30.410801152, "train/loss_slope": -1.9185635909782176e-06} {"step": 14500, "timestamp": 1778341387.2927644, "geo/ww_alpha_mean": 7.64504704685445, "geo/ww_alpha_std": 4.723266968056484, "geo/ww_alpha_min": 1.3643989326961858, "geo/ww_alpha_max": 34.817319900645906, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.917368423235015, "geo/ww_alpha_by_type/k_proj": 4.4298083374796935, "geo/ww_alpha_by_type/v_proj": 9.124876976965641, "geo/ww_alpha_by_type/o_proj": 8.057258055445748, "geo/ww_alpha_by_type/gate_proj": 7.624673956730284, "geo/ww_alpha_by_type/up_proj": 12.35886423122227, "geo/ww_alpha_by_type/down_proj": 8.099576304564525, "geo/twonn_id/layer_0": 0.763129472732544, "geo/twonn_id/layer_7": 3.890470266342163, "geo/twonn_id/layer_14": 4.96958589553833, "geo/twonn_id/layer_21": 6.856235027313232, "geo/twonn_id/layer_27": 5.95033597946167, "geo/tier2_time_s": 7.1891844272613525} {"step": 14500, "timestamp": 1778341388.1882544, "eoc/jacobian_sigma/layer_0/attn": 1558.017822265625, "eoc/jacobian_sigma/layer_0/mlp": 10350.9736328125, "eoc/jacobian_sigma/layer_0": 10350.9736328125, "eoc/jacobian_sigma/layer_7/attn": 1.139836311340332, "eoc/jacobian_sigma/layer_7/mlp": 1.8272415399551392, "eoc/jacobian_sigma/layer_7": 1.8272415399551392, "eoc/jacobian_sigma/layer_14/attn": 2.1182422637939453, "eoc/jacobian_sigma/layer_14/mlp": 12.782036781311035, "eoc/jacobian_sigma/layer_14": 12.782036781311035, "eoc/jacobian_sigma/layer_21/attn": 1.0908280611038208, "eoc/jacobian_sigma/layer_21/mlp": 6.380295753479004, "eoc/jacobian_sigma/layer_21": 6.380295753479004, "eoc/jacobian_sigma/layer_27/attn": 3.84792160987854, "eoc/jacobian_sigma/layer_27/mlp": 47.59278869628906, "eoc/jacobian_sigma/layer_27": 47.59278869628906, "eoc/layer0_sigma": 10350.9736328125, "eoc/sigma_max": 47.59278869628906, "eoc/sigma_min": 1.8272415399551392, "eoc/sigma_mean": 17.14559069275856, "eoc/time_s": 0.8857266902923584} {"step": 14510, "timestamp": 1778341398.556192, "train/loss": 2.3631418228149412, "train/z_loss": 0.0013705000397749244, "train/perplexity": 10.624278672033869, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1136408.9953970367, "perf/iters_per_sec": 0.5418820359215911, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8454200983047486, "data/tokens_consumed": 30431772672, "data/tokens_consumed_B": 30.431772672, "train/loss_slope": 4.867421804589004e-07} {"step": 14520, "timestamp": 1778341408.916892, "train/loss": 2.3405331134796143, "train/z_loss": 0.0013770215795375408, "train/perplexity": 10.38677241536622, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025191.406816014, "perf/iters_per_sec": 0.965686515243537, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355327367782592, "data/tokens_consumed": 30452744192, "data/tokens_consumed_B": 30.452744192, "train/loss_slope": -2.4694035441675784e-06} {"step": 14530, "timestamp": 1778341419.264896, "train/loss": 2.333831214904785, "train/z_loss": 0.0013592175440862776, "train/perplexity": 10.317394063138329, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028000.502846583, "perf/iters_per_sec": 0.9670259966118732, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340983629226685, "data/tokens_consumed": 30473715712, "data/tokens_consumed_B": 30.473715712, "train/loss_slope": -4.267772754104935e-06} {"step": 14540, "timestamp": 1778341429.6101367, "train/loss": 2.324350905418396, "train/z_loss": 0.001371538860257715, "train/perplexity": 10.220044157065196, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028636.222717455, "perf/iters_per_sec": 0.967329131468513, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337743043899537, "data/tokens_consumed": 30494687232, "data/tokens_consumed_B": 30.494687232, "train/loss_slope": -5.470942907278984e-06} {"step": 14550, "timestamp": 1778341439.9742475, "grad/layer_0/attn": 0.0033167661167681217, "grad/layer_0/mlp": 0.0035442959051579237, "grad/layer_0/attn_mlp_ratio": 0.9358039260663659, "grad/layer_4/attn": 0.0034718161914497614, "grad/layer_4/mlp": 0.002699992386624217, "grad/layer_4/attn_mlp_ratio": 1.285861426892543, "grad/layer_8/attn": 0.003013795707374811, "grad/layer_8/mlp": 0.003452091943472624, "grad/layer_8/attn_mlp_ratio": 0.8730345742296867, "grad/layer_12/attn": 0.00819083396345377, "grad/layer_12/mlp": 0.0070264930836856365, "grad/layer_12/attn_mlp_ratio": 1.1657072382097433, "grad/layer_16/attn": 0.003636727575212717, "grad/layer_16/mlp": 0.0050052460283041, "grad/layer_16/attn_mlp_ratio": 0.7265831653407482, "grad/layer_20/attn": 0.0032928616274148226, "grad/layer_20/mlp": 0.006666434928774834, "grad/layer_20/attn_mlp_ratio": 0.49394640661786954, "grad/layer_24/attn": 0.013285498134791851, "grad/layer_24/mlp": 0.010435784235596657, "grad/layer_24/attn_mlp_ratio": 1.2730713578925512, "grad/layer_27/attn": 0.013915854506194592, "grad/layer_27/mlp": 0.011313790455460548, "grad/layer_27/attn_mlp_ratio": 1.2299904649974425} {"step": 14550, "timestamp": 1778341440.6057436, "eos/sharpness": 63.78571987152098, "eos/L0_probe": 2.309392213821411, "eos/L_plus": 2.6988863945007324, "eos/L_minus": 2.5577552318573, "eos/grad_norm": 0.21176841855049133, "eos/embed_grad_frac": 0.05947655811905861, "eos/time_s": 0.6284329891204834} {"step": 14550, "timestamp": 1778341440.6261668, "train/loss": 2.34203245639801, "train/z_loss": 0.0013654589653015136, "train/perplexity": 10.402357429753035, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904834.2284886863, "perf/iters_per_sec": 0.9082957403605872, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1009629964828491, "data/tokens_consumed": 30515658752, "data/tokens_consumed_B": 30.515658752, "train/loss_slope": -4.821906903825117e-06} {"step": 14550, "timestamp": 1778341441.994703, "geo/rankme_last": 429.2495422363281, "geo/layer_0/stable_rank_q_proj": 20.650653839111328, "geo/layer_0/stable_rank_k_proj": 16.811704635620117, "geo/layer_0/stable_rank_o_proj": 43.822811126708984, "geo/layer_0/stable_rank_gate_proj": 124.6878433227539, "geo/layer_0/stable_rank_down_proj": 57.99508285522461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06110740080475807, "geo/layer_0/attn_entropy_mean": 6.230292320251465, "geo/layer_0/attn_entropy_std": 0.45921215415000916, "geo/layer_7/stable_rank_q_proj": 41.7071647644043, "geo/layer_7/stable_rank_k_proj": 38.58794021606445, "geo/layer_7/stable_rank_o_proj": 88.26237487792969, "geo/layer_7/stable_rank_gate_proj": 77.79646301269531, "geo/layer_7/stable_rank_down_proj": 144.6024627685547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40006113052368164, "geo/layer_7/attn_entropy_mean": 4.797218322753906, "geo/layer_7/attn_entropy_std": 0.7515610456466675, "geo/layer_14/stable_rank_q_proj": 51.66987228393555, "geo/layer_14/stable_rank_k_proj": 43.99980926513672, "geo/layer_14/stable_rank_o_proj": 42.32610321044922, "geo/layer_14/stable_rank_gate_proj": 72.0043716430664, "geo/layer_14/stable_rank_down_proj": 127.22671508789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.382943719625473, "geo/layer_14/attn_entropy_mean": 5.557607650756836, "geo/layer_14/attn_entropy_std": 0.4672814905643463, "geo/layer_21/stable_rank_q_proj": 38.190773010253906, "geo/layer_21/stable_rank_k_proj": 28.44028663635254, "geo/layer_21/stable_rank_o_proj": 64.82012939453125, "geo/layer_21/stable_rank_gate_proj": 59.80086898803711, "geo/layer_21/stable_rank_down_proj": 48.73143005371094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1354593187570572, "geo/layer_21/attn_entropy_mean": 5.868927955627441, "geo/layer_21/attn_entropy_std": 0.32120269536972046, "geo/layer_27/stable_rank_q_proj": 45.25773239135742, "geo/layer_27/stable_rank_k_proj": 30.63131332397461, "geo/layer_27/stable_rank_o_proj": 106.75347137451172, "geo/layer_27/stable_rank_gate_proj": 69.21305084228516, "geo/layer_27/stable_rank_down_proj": 130.00732421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10073229670524597, "geo/layer_27/attn_entropy_mean": 4.308345794677734, "geo/layer_27/attn_entropy_std": 0.7139031887054443, "attnres/final_alpha/block_0": 0.26103705167770386, "attnres/block_norm/0": 1.7833194732666016, "attnres/final_alpha/block_1": 0.0037686475552618504, "attnres/block_norm/1": 50730.609375, "attnres/final_alpha/block_2": 0.008160803467035294, "attnres/block_norm/2": 30062.326171875, "attnres/final_alpha/block_3": 0.01038143876940012, "attnres/block_norm/3": 74013.296875, "attnres/final_alpha/block_4": 0.011553674936294556, "attnres/block_norm/4": 17686.21484375, "attnres/final_alpha/block_5": 0.6084305644035339, "attnres/block_norm/5": 7145.53759765625, "attnres/final_alpha/block_6": 0.09666778147220612, "attnres/block_norm/6": 49804.890625, "geo/tier1_time_s": 1.3645973205566406, "geo/step": 14550.0, "geo/rankme_slope": 7.569209324354742e-05} {"step": 14560, "timestamp": 1778341452.3432066, "train/loss": 2.3145488023757936, "train/z_loss": 0.0013584744418039918, "train/perplexity": 10.120355608034451, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790421.2787014463, "perf/iters_per_sec": 0.8537393945224029, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1713176250457764, "data/tokens_consumed": 30536630272, "data/tokens_consumed_B": 30.536630272, "train/loss_slope": -7.220236546207071e-06} {"step": 14570, "timestamp": 1778341462.7048023, "train/loss": 2.3061145544052124, "train/z_loss": 0.0013694787630811334, "train/perplexity": 10.035356972943712, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024933.682674746, "perf/iters_per_sec": 0.9655636227964144, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356645345687867, "data/tokens_consumed": 30557601792, "data/tokens_consumed_B": 30.557601792, "train/loss_slope": -5.895807762386305e-06} {"step": 14580, "timestamp": 1778341473.0555737, "train/loss": 2.3165542125701903, "train/z_loss": 0.0013683898490853609, "train/perplexity": 10.140671436317668, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027615.9997329735, "perf/iters_per_sec": 0.9668426512398594, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342944622039796, "data/tokens_consumed": 30578573312, "data/tokens_consumed_B": 30.578573312, "train/loss_slope": -8.277676293630298e-06} {"step": 14590, "timestamp": 1778341483.4125535, "train/loss": 2.3099668979644776, "train/z_loss": 0.001367768459022045, "train/perplexity": 10.074091176570226, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026451.7399281736, "perf/iters_per_sec": 0.9662874889031284, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348886966705322, "data/tokens_consumed": 30599544832, "data/tokens_consumed_B": 30.599544832, "train/loss_slope": -9.400143596169563e-06} {"step": 14600, "timestamp": 1778341493.7469978, "grad/layer_0/attn": 0.003345862263813615, "grad/layer_0/mlp": 0.0035121766850352287, "grad/layer_0/attn_mlp_ratio": 0.9526463126997904, "grad/layer_4/attn": 0.0023710138630121946, "grad/layer_4/mlp": 0.0027368743903934956, "grad/layer_4/attn_mlp_ratio": 0.8663217371985885, "grad/layer_8/attn": 0.004030915908515453, "grad/layer_8/mlp": 0.003570751752704382, "grad/layer_8/attn_mlp_ratio": 1.1288703541419594, "grad/layer_12/attn": 0.006956988945603371, "grad/layer_12/mlp": 0.00791358295828104, "grad/layer_12/attn_mlp_ratio": 0.879119975663027, "grad/layer_16/attn": 0.003531503025442362, "grad/layer_16/mlp": 0.004575404338538647, "grad/layer_16/attn_mlp_ratio": 0.7718449970665981, "grad/layer_20/attn": 0.007583777420222759, "grad/layer_20/mlp": 0.005914234556257725, "grad/layer_20/attn_mlp_ratio": 1.2822922763469535, "grad/layer_24/attn": 0.006261513102799654, "grad/layer_24/mlp": 0.008723112754523754, "grad/layer_24/attn_mlp_ratio": 0.7178071873221806, "grad/layer_27/attn": 0.005360142793506384, "grad/layer_27/mlp": 0.009390346705913544, "grad/layer_27/attn_mlp_ratio": 0.5708141460899878} {"step": 14600, "timestamp": 1778341493.7612967, "train/loss": 2.3434866666793823, "train/z_loss": 0.0013679037103429437, "train/perplexity": 10.417495649287368, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027433.1255992372, "perf/iters_per_sec": 0.9667554500576196, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034387755393982, "data/tokens_consumed": 30620516352, "data/tokens_consumed_B": 30.620516352, "train/loss_slope": -7.72188285646798e-06} {"step": 14610, "timestamp": 1778341504.1204555, "train/loss": 2.2786206007003784, "train/z_loss": 0.0013741349801421165, "train/perplexity": 9.763203760396447, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026015.5571307656, "perf/iters_per_sec": 0.9660795007375553, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351114988327026, "data/tokens_consumed": 30641487872, "data/tokens_consumed_B": 30.641487872, "train/loss_slope": -9.160019834228244e-06} {"step": 14620, "timestamp": 1778341514.490557, "train/loss": 2.294340467453003, "train/z_loss": 0.0013722940464504063, "train/perplexity": 9.917892681731578, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023961.8399809091, "perf/iters_per_sec": 0.9651002120880647, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361618280410767, "data/tokens_consumed": 30662459392, "data/tokens_consumed_B": 30.662459392, "train/loss_slope": -1.4798746944510995e-05} {"step": 14625, "timestamp": 1778341520.265486, "eos/sharpness": 54.025673866271966, "eos/L0_probe": 2.3060243129730225, "eos/L_plus": 2.573718786239624, "eos/L_minus": 2.5785865783691406, "eos/grad_norm": 0.2066689133644104, "eos/embed_grad_frac": 0.05351141095161438, "eos/time_s": 0.5934383869171143} {"step": 14625, "timestamp": 1778341521.648364, "geo/rankme_last": 428.0323791503906, "geo/layer_0/stable_rank_q_proj": 20.647903442382812, "geo/layer_0/stable_rank_k_proj": 16.793167114257812, "geo/layer_0/stable_rank_o_proj": 43.7708854675293, "geo/layer_0/stable_rank_gate_proj": 124.49801635742188, "geo/layer_0/stable_rank_down_proj": 57.96212387084961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06376007199287415, "geo/layer_0/attn_entropy_mean": 6.232312202453613, "geo/layer_0/attn_entropy_std": 0.46324390172958374, "geo/layer_7/stable_rank_q_proj": 41.684696197509766, "geo/layer_7/stable_rank_k_proj": 38.52016830444336, "geo/layer_7/stable_rank_o_proj": 88.1488037109375, "geo/layer_7/stable_rank_gate_proj": 77.83064270019531, "geo/layer_7/stable_rank_down_proj": 144.6865692138672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3983501195907593, "geo/layer_7/attn_entropy_mean": 4.757205009460449, "geo/layer_7/attn_entropy_std": 0.7781963348388672, "geo/layer_14/stable_rank_q_proj": 51.68532180786133, "geo/layer_14/stable_rank_k_proj": 44.014278411865234, "geo/layer_14/stable_rank_o_proj": 42.314876556396484, "geo/layer_14/stable_rank_gate_proj": 72.06029510498047, "geo/layer_14/stable_rank_down_proj": 127.2452163696289, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38344624638557434, "geo/layer_14/attn_entropy_mean": 5.536700248718262, "geo/layer_14/attn_entropy_std": 0.4782240092754364, "geo/layer_21/stable_rank_q_proj": 38.11767578125, "geo/layer_21/stable_rank_k_proj": 28.333866119384766, "geo/layer_21/stable_rank_o_proj": 64.78096771240234, "geo/layer_21/stable_rank_gate_proj": 59.874568939208984, "geo/layer_21/stable_rank_down_proj": 48.72861099243164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1325341910123825, "geo/layer_21/attn_entropy_mean": 5.862897872924805, "geo/layer_21/attn_entropy_std": 0.32908815145492554, "geo/layer_27/stable_rank_q_proj": 45.23646545410156, "geo/layer_27/stable_rank_k_proj": 30.63755226135254, "geo/layer_27/stable_rank_o_proj": 106.83635711669922, "geo/layer_27/stable_rank_gate_proj": 69.21197509765625, "geo/layer_27/stable_rank_down_proj": 130.02976989746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10681188106536865, "geo/layer_27/attn_entropy_mean": 4.299083709716797, "geo/layer_27/attn_entropy_std": 0.7013877630233765, "attnres/final_alpha/block_0": 0.2627367079257965, "attnres/block_norm/0": 1.7832690477371216, "attnres/final_alpha/block_1": 0.0037521885242313147, "attnres/block_norm/1": 50815.078125, "attnres/final_alpha/block_2": 0.008230105973780155, "attnres/block_norm/2": 30070.98046875, "attnres/final_alpha/block_3": 0.01042760256677866, "attnres/block_norm/3": 73661.015625, "attnres/final_alpha/block_4": 0.011577618308365345, "attnres/block_norm/4": 17739.59765625, "attnres/final_alpha/block_5": 0.6046400666236877, "attnres/block_norm/5": 7239.98193359375, "attnres/final_alpha/block_6": 0.09863568842411041, "attnres/block_norm/6": 49837.51171875, "geo/tier1_time_s": 1.364367961883545, "geo/step": 14625.0, "geo/rankme_slope": 8.086105535964384e-06} {"step": 14630, "timestamp": 1778341526.8356473, "train/loss": 2.3394349813461304, "train/z_loss": 0.0013634139322675765, "train/perplexity": 10.375372627196153, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699526.213235189, "perf/iters_per_sec": 0.8103972498107858, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2339627265930175, "data/tokens_consumed": 30683430912, "data/tokens_consumed_B": 30.683430912, "train/loss_slope": -1.3162994284619751e-05} {"step": 14640, "timestamp": 1778341537.2032578, "train/loss": 2.312630367279053, "train/z_loss": 0.0013660942553542555, "train/perplexity": 10.100958974185005, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024518.8417296887, "perf/iters_per_sec": 0.965365811219067, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035876750946045, "data/tokens_consumed": 30704402432, "data/tokens_consumed_B": 30.704402432, "train/loss_slope": -1.3812543835827314e-05} {"step": 14650, "timestamp": 1778341547.578213, "grad/layer_0/attn": 0.0028796554543077946, "grad/layer_0/mlp": 0.0031555411405861378, "grad/layer_0/attn_mlp_ratio": 0.912571009141019, "grad/layer_4/attn": 0.0018795847427099943, "grad/layer_4/mlp": 0.002605208195745945, "grad/layer_4/attn_mlp_ratio": 0.721471962828913, "grad/layer_8/attn": 0.008093366399407387, "grad/layer_8/mlp": 0.003472992917522788, "grad/layer_8/attn_mlp_ratio": 2.330372205925197, "grad/layer_12/attn": 0.006655230186879635, "grad/layer_12/mlp": 0.0070088207721710205, "grad/layer_12/attn_mlp_ratio": 0.9495506174661504, "grad/layer_16/attn": 0.0034566058311611414, "grad/layer_16/mlp": 0.004439822863787413, "grad/layer_16/attn_mlp_ratio": 0.7785458698137971, "grad/layer_20/attn": 0.002794952131807804, "grad/layer_20/mlp": 0.005616898648440838, "grad/layer_20/attn_mlp_ratio": 0.49759702942547057, "grad/layer_24/attn": 0.004472092259675264, "grad/layer_24/mlp": 0.006966572720557451, "grad/layer_24/attn_mlp_ratio": 0.6419357659592247, "grad/layer_27/attn": 0.004595302511006594, "grad/layer_27/mlp": 0.006222118623554707, "grad/layer_27/attn_mlp_ratio": 0.7385430454115942} {"step": 14650, "timestamp": 1778341547.594636, "train/loss": 2.3343019485473633, "train/z_loss": 0.0013678902061656118, "train/perplexity": 10.322251950923485, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019646.43845638, "perf/iters_per_sec": 0.9630424682886028, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0383758068084716, "data/tokens_consumed": 30725373952, "data/tokens_consumed_B": 30.725373952, "train/loss_slope": -1.174341916251098e-05} {"step": 14660, "timestamp": 1778341558.3405368, "train/loss": 2.30730664730072, "train/z_loss": 0.0013651284156367184, "train/perplexity": 10.047327184079311, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952774.3137607148, "perf/iters_per_sec": 0.9311553543857168, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0739346504211427, "data/tokens_consumed": 30746345472, "data/tokens_consumed_B": 30.746345472, "train/loss_slope": -1.4817568678559691e-05} {"step": 14670, "timestamp": 1778341568.6909387, "train/loss": 2.28461229801178, "train/z_loss": 0.0013692568754777312, "train/perplexity": 9.821877524300621, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027796.6162269504, "perf/iters_per_sec": 0.9669287758955719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342023372650146, "data/tokens_consumed": 30767316992, "data/tokens_consumed_B": 30.767316992, "train/loss_slope": -1.5737336239155163e-05} {"step": 14680, "timestamp": 1778341579.0452487, "train/loss": 2.3401583671569823, "train/z_loss": 0.0013669229811057448, "train/perplexity": 10.382880739840637, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026878.5826636844, "perf/iters_per_sec": 0.966491023380129, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346707582473755, "data/tokens_consumed": 30788288512, "data/tokens_consumed_B": 30.788288512, "train/loss_slope": -1.8321070912862113e-05} {"step": 14690, "timestamp": 1778341589.3990254, "train/loss": 2.3382488012313845, "train/z_loss": 0.001367943303193897, "train/perplexity": 10.363072862812828, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026532.4158144346, "perf/iters_per_sec": 0.9663259581634687, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348474979400635, "data/tokens_consumed": 30809260032, "data/tokens_consumed_B": 30.809260032, "train/loss_slope": -1.4764456258247847e-05} {"step": 14700, "timestamp": 1778341599.7395232, "grad/layer_0/attn": 0.0028141711372882128, "grad/layer_0/mlp": 0.003199788276106119, "grad/layer_0/attn_mlp_ratio": 0.8794866430238205, "grad/layer_4/attn": 0.001984553411602974, "grad/layer_4/mlp": 0.0027400029357522726, "grad/layer_4/attn_mlp_ratio": 0.7242887638108442, "grad/layer_8/attn": 0.00430484302341938, "grad/layer_8/mlp": 0.0035872759763151407, "grad/layer_8/attn_mlp_ratio": 1.2000311467082083, "grad/layer_12/attn": 0.007077801041305065, "grad/layer_12/mlp": 0.007348326500505209, "grad/layer_12/attn_mlp_ratio": 0.9631854197686934, "grad/layer_16/attn": 0.0034786956384778023, "grad/layer_16/mlp": 0.005045704543590546, "grad/layer_16/attn_mlp_ratio": 0.6894370329219959, "grad/layer_20/attn": 0.005830432288348675, "grad/layer_20/mlp": 0.006281073205173016, "grad/layer_20/attn_mlp_ratio": 0.9282541382771002, "grad/layer_24/attn": 0.008214572444558144, "grad/layer_24/mlp": 0.010827803984284401, "grad/layer_24/attn_mlp_ratio": 0.7586554374843989, "grad/layer_27/attn": 0.010905471630394459, "grad/layer_27/mlp": 0.010511986911296844, "grad/layer_27/attn_mlp_ratio": 1.0374319925124291} {"step": 14700, "timestamp": 1778341600.3427896, "eos/sharpness": 12.516331672668455, "eos/L0_probe": 2.3064777851104736, "eos/L_plus": 2.370492696762085, "eos/L_minus": 2.367626190185547, "eos/grad_norm": 0.12852375209331512, "eos/embed_grad_frac": 0.16318875551223755, "eos/time_s": 0.6004254817962646} {"step": 14700, "timestamp": 1778341600.364301, "train/loss": 2.3017053842544555, "train/z_loss": 0.0013554617646150291, "train/perplexity": 9.99120678090702, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913764.7120374811, "perf/iters_per_sec": 0.9125541267573744, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0958254098892213, "data/tokens_consumed": 30830231552, "data/tokens_consumed_B": 30.830231552, "train/loss_slope": -1.5984882408528897e-05} {"step": 14700, "timestamp": 1778341601.730636, "geo/rankme_last": 429.8480529785156, "geo/layer_0/stable_rank_q_proj": 20.660917282104492, "geo/layer_0/stable_rank_k_proj": 16.79278564453125, "geo/layer_0/stable_rank_o_proj": 43.81209945678711, "geo/layer_0/stable_rank_gate_proj": 124.51881408691406, "geo/layer_0/stable_rank_down_proj": 57.94159698486328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06411917507648468, "geo/layer_0/attn_entropy_mean": 6.232417106628418, "geo/layer_0/attn_entropy_std": 0.45973077416419983, "geo/layer_7/stable_rank_q_proj": 41.72480010986328, "geo/layer_7/stable_rank_k_proj": 38.49106216430664, "geo/layer_7/stable_rank_o_proj": 88.12062072753906, "geo/layer_7/stable_rank_gate_proj": 77.75843811035156, "geo/layer_7/stable_rank_down_proj": 144.7452850341797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38531693816185, "geo/layer_7/attn_entropy_mean": 4.748889446258545, "geo/layer_7/attn_entropy_std": 0.7728245854377747, "geo/layer_14/stable_rank_q_proj": 51.62263870239258, "geo/layer_14/stable_rank_k_proj": 44.001564025878906, "geo/layer_14/stable_rank_o_proj": 42.26015090942383, "geo/layer_14/stable_rank_gate_proj": 71.98916625976562, "geo/layer_14/stable_rank_down_proj": 127.13651275634766, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3742157518863678, "geo/layer_14/attn_entropy_mean": 5.50137186050415, "geo/layer_14/attn_entropy_std": 0.46327629685401917, "geo/layer_21/stable_rank_q_proj": 38.14725112915039, "geo/layer_21/stable_rank_k_proj": 28.41708755493164, "geo/layer_21/stable_rank_o_proj": 64.77840423583984, "geo/layer_21/stable_rank_gate_proj": 59.7863883972168, "geo/layer_21/stable_rank_down_proj": 48.705570220947266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14119760692119598, "geo/layer_21/attn_entropy_mean": 5.841988563537598, "geo/layer_21/attn_entropy_std": 0.3267943561077118, "geo/layer_27/stable_rank_q_proj": 45.2720947265625, "geo/layer_27/stable_rank_k_proj": 30.59119415283203, "geo/layer_27/stable_rank_o_proj": 106.80235290527344, "geo/layer_27/stable_rank_gate_proj": 69.12931823730469, "geo/layer_27/stable_rank_down_proj": 129.95452880859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1029215008020401, "geo/layer_27/attn_entropy_mean": 4.278931617736816, "geo/layer_27/attn_entropy_std": 0.698512852191925, "attnres/final_alpha/block_0": 0.26066064834594727, "attnres/block_norm/0": 1.7834833860397339, "attnres/final_alpha/block_1": 0.0036749772261828184, "attnres/block_norm/1": 50806.46875, "attnres/final_alpha/block_2": 0.007916925475001335, "attnres/block_norm/2": 29993.21484375, "attnres/final_alpha/block_3": 0.01010514609515667, "attnres/block_norm/3": 73896.5, "attnres/final_alpha/block_4": 0.011386355385184288, "attnres/block_norm/4": 17698.126953125, "attnres/final_alpha/block_5": 0.6095775365829468, "attnres/block_norm/5": 7140.9150390625, "attnres/final_alpha/block_6": 0.0966784805059433, "attnres/block_norm/6": 49876.1328125, "geo/tier1_time_s": 1.3620469570159912, "geo/step": 14700.0, "geo/rankme_slope": 8.599533563425371e-06} {"step": 14710, "timestamp": 1778341612.0921683, "train/loss": 2.305457520484924, "train/z_loss": 0.0013748890138231217, "train/perplexity": 10.028765568635515, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788802.2816643585, "perf/iters_per_sec": 0.8529673965760987, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1723777532577515, "data/tokens_consumed": 30851203072, "data/tokens_consumed_B": 30.851203072, "train/loss_slope": -1.77724957144228e-05} {"step": 14720, "timestamp": 1778341623.0473137, "train/loss": 2.3589749574661254, "train/z_loss": 0.001371828792616725, "train/perplexity": 10.580100838841133, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915027.2098644716, "perf/iters_per_sec": 0.9131561326334341, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0951029777526855, "data/tokens_consumed": 30872174592, "data/tokens_consumed_B": 30.872174592, "train/loss_slope": -1.686921458755066e-05} {"step": 14730, "timestamp": 1778341633.409931, "train/loss": 2.3689979791641234, "train/z_loss": 0.0013556616264395415, "train/perplexity": 10.686678642695206, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025458.757690318, "perf/iters_per_sec": 0.9658139980746832, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353960514068603, "data/tokens_consumed": 30893146112, "data/tokens_consumed_B": 30.893146112, "train/loss_slope": -1.523071123678836e-05} {"step": 14740, "timestamp": 1778341643.7734754, "train/loss": 2.343570518493652, "train/z_loss": 0.0013667411636561154, "train/perplexity": 10.418369211822098, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024568.6080763873, "perf/iters_per_sec": 0.9653895416624009, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035851287841797, "data/tokens_consumed": 30914117632, "data/tokens_consumed_B": 30.914117632, "train/loss_slope": -1.7068769393151803e-05} {"step": 14750, "timestamp": 1778341654.1124663, "grad/layer_0/attn": 0.003152222605422139, "grad/layer_0/mlp": 0.003461553016677499, "grad/layer_0/attn_mlp_ratio": 0.9106382306355407, "grad/layer_4/attn": 0.002536248881369829, "grad/layer_4/mlp": 0.0026841890066862106, "grad/layer_4/attn_mlp_ratio": 0.9448845742843274, "grad/layer_8/attn": 0.003395556937903166, "grad/layer_8/mlp": 0.0034391426015645266, "grad/layer_8/attn_mlp_ratio": 0.9873265614591878, "grad/layer_12/attn": 0.008042935281991959, "grad/layer_12/mlp": 0.007261793129146099, "grad/layer_12/attn_mlp_ratio": 1.1075687544655843, "grad/layer_16/attn": 0.006742625031620264, "grad/layer_16/mlp": 0.005123435519635677, "grad/layer_16/attn_mlp_ratio": 1.3160358658121922, "grad/layer_20/attn": 0.002702783327549696, "grad/layer_20/mlp": 0.006008629687130451, "grad/layer_20/attn_mlp_ratio": 0.44981691721773853, "grad/layer_24/attn": 0.00968212354928255, "grad/layer_24/mlp": 0.009396079927682877, "grad/layer_24/attn_mlp_ratio": 1.0304428571017838, "grad/layer_27/attn": 0.01400196086615324, "grad/layer_27/mlp": 0.008482106029987335, "grad/layer_27/attn_mlp_ratio": 1.6507646392976867} {"step": 14750, "timestamp": 1778341654.1283305, "train/loss": 2.3409825563430786, "train/z_loss": 0.0013677041395567357, "train/perplexity": 10.391441725318153, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026287.8401296996, "perf/iters_per_sec": 0.966209335388994, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349724054336549, "data/tokens_consumed": 30935089152, "data/tokens_consumed_B": 30.935089152, "train/loss_slope": -1.4072234988963704e-05} {"step": 14760, "timestamp": 1778341664.479282, "train/loss": 2.3678145170211793, "train/z_loss": 0.0013599158963188529, "train/perplexity": 10.674038843924693, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027431.9573297852, "perf/iters_per_sec": 0.9667548929833342, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343883514404297, "data/tokens_consumed": 30956060672, "data/tokens_consumed_B": 30.956060672, "train/loss_slope": -1.0635307333757689e-05} {"step": 14770, "timestamp": 1778341674.8291993, "train/loss": 2.3485876083374024, "train/z_loss": 0.0013685802114196122, "train/perplexity": 10.470770447119133, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027216.598176156, "perf/iters_per_sec": 0.9666522017365246, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344982385635375, "data/tokens_consumed": 30977032192, "data/tokens_consumed_B": 30.977032192, "train/loss_slope": -9.541851015660359e-06} {"step": 14775, "timestamp": 1778341680.593371, "eos/sharpness": 27.502369880676262, "eos/L0_probe": 2.3060600757598877, "eos/L_plus": 2.463311195373535, "eos/L_minus": 2.423832654953003, "eos/grad_norm": 0.1002747043967247, "eos/embed_grad_frac": 0.24292831122875214, "eos/time_s": 0.5978207588195801} {"step": 14775, "timestamp": 1778341681.9729514, "geo/rankme_last": 430.1398010253906, "geo/layer_0/stable_rank_q_proj": 20.658437728881836, "geo/layer_0/stable_rank_k_proj": 16.788808822631836, "geo/layer_0/stable_rank_o_proj": 43.856815338134766, "geo/layer_0/stable_rank_gate_proj": 124.29138946533203, "geo/layer_0/stable_rank_down_proj": 57.94016647338867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0624828040599823, "geo/layer_0/attn_entropy_mean": 6.229326248168945, "geo/layer_0/attn_entropy_std": 0.46365755796432495, "geo/layer_7/stable_rank_q_proj": 41.75836181640625, "geo/layer_7/stable_rank_k_proj": 38.5078239440918, "geo/layer_7/stable_rank_o_proj": 87.94237518310547, "geo/layer_7/stable_rank_gate_proj": 77.62088012695312, "geo/layer_7/stable_rank_down_proj": 144.20187377929688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3882429599761963, "geo/layer_7/attn_entropy_mean": 4.763596534729004, "geo/layer_7/attn_entropy_std": 0.7609077095985413, "geo/layer_14/stable_rank_q_proj": 51.52949523925781, "geo/layer_14/stable_rank_k_proj": 44.035823822021484, "geo/layer_14/stable_rank_o_proj": 42.249263763427734, "geo/layer_14/stable_rank_gate_proj": 71.8622055053711, "geo/layer_14/stable_rank_down_proj": 126.998779296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37858280539512634, "geo/layer_14/attn_entropy_mean": 5.548990726470947, "geo/layer_14/attn_entropy_std": 0.47397956252098083, "geo/layer_21/stable_rank_q_proj": 38.20075225830078, "geo/layer_21/stable_rank_k_proj": 28.434715270996094, "geo/layer_21/stable_rank_o_proj": 64.67817687988281, "geo/layer_21/stable_rank_gate_proj": 59.75859832763672, "geo/layer_21/stable_rank_down_proj": 48.711883544921875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13543088734149933, "geo/layer_21/attn_entropy_mean": 5.859553813934326, "geo/layer_21/attn_entropy_std": 0.3224514126777649, "geo/layer_27/stable_rank_q_proj": 45.284751892089844, "geo/layer_27/stable_rank_k_proj": 30.642959594726562, "geo/layer_27/stable_rank_o_proj": 106.63724517822266, "geo/layer_27/stable_rank_gate_proj": 69.13661193847656, "geo/layer_27/stable_rank_down_proj": 130.04226684570312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10014894604682922, "geo/layer_27/attn_entropy_mean": 4.309957981109619, "geo/layer_27/attn_entropy_std": 0.6775742173194885, "attnres/final_alpha/block_0": 0.2615416944026947, "attnres/block_norm/0": 1.7836534976959229, "attnres/final_alpha/block_1": 0.003771443385630846, "attnres/block_norm/1": 50726.05859375, "attnres/final_alpha/block_2": 0.00798753835260868, "attnres/block_norm/2": 30160.03125, "attnres/final_alpha/block_3": 0.010233568027615547, "attnres/block_norm/3": 74104.078125, "attnres/final_alpha/block_4": 0.011602101847529411, "attnres/block_norm/4": 17805.78125, "attnres/final_alpha/block_5": 0.6066725850105286, "attnres/block_norm/5": 7189.4228515625, "attnres/final_alpha/block_6": 0.09819108247756958, "attnres/block_norm/6": 49678.4921875, "geo/tier1_time_s": 1.358293056488037, "geo/step": 14775.0, "geo/rankme_slope": 2.7641290891356543e-05} {"step": 14780, "timestamp": 1778341687.1657352, "train/loss": 2.3390368938446047, "train/z_loss": 0.0013779538916423918, "train/perplexity": 10.371243143032146, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702945.0245186836, "perf/iters_per_sec": 0.8120274660676401, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2314854383468627, "data/tokens_consumed": 30998003712, "data/tokens_consumed_B": 30.998003712, "train/loss_slope": -8.696257012976988e-06} {"step": 14790, "timestamp": 1778341697.5275154, "train/loss": 2.3228265285491942, "train/z_loss": 0.0013545211986638606, "train/perplexity": 10.204476826403846, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025711.6704464506, "perf/iters_per_sec": 0.9659345962745908, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352667808532714, "data/tokens_consumed": 31018975232, "data/tokens_consumed_B": 31.018975232, "train/loss_slope": -8.898256907332874e-06} {"step": 14800, "timestamp": 1778341707.8785589, "grad/layer_0/attn": 0.0028753553051501513, "grad/layer_0/mlp": 0.0036426305305212736, "grad/layer_0/attn_mlp_ratio": 0.7893623034566855, "grad/layer_4/attn": 0.001964562339708209, "grad/layer_4/mlp": 0.0025879302993416786, "grad/layer_4/attn_mlp_ratio": 0.759124874536022, "grad/layer_8/attn": 0.009166677482426167, "grad/layer_8/mlp": 0.0035619880072772503, "grad/layer_8/attn_mlp_ratio": 2.573472231335743, "grad/layer_12/attn": 0.006577345076948404, "grad/layer_12/mlp": 0.006938189268112183, "grad/layer_12/attn_mlp_ratio": 0.9479915764735086, "grad/layer_16/attn": 0.004161973483860493, "grad/layer_16/mlp": 0.004527505021542311, "grad/layer_16/attn_mlp_ratio": 0.9192642243644116, "grad/layer_20/attn": 0.0028181993402540684, "grad/layer_20/mlp": 0.005454606376588345, "grad/layer_20/attn_mlp_ratio": 0.5166640989317983, "grad/layer_24/attn": 0.009950070641934872, "grad/layer_24/mlp": 0.008514342829585075, "grad/layer_24/attn_mlp_ratio": 1.168624604884192, "grad/layer_27/attn": 0.00504686776548624, "grad/layer_27/mlp": 0.009431962855160236, "grad/layer_27/attn_mlp_ratio": 0.5350813812012583} {"step": 14800, "timestamp": 1778341707.8950043, "train/loss": 2.335385537147522, "train/z_loss": 0.0013664302998222411, "train/perplexity": 10.33344308766455, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023865.675598379, "perf/iters_per_sec": 0.9650543573371787, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362110614776612, "data/tokens_consumed": 31039946752, "data/tokens_consumed_B": 31.039946752, "train/loss_slope": -8.462414425341456e-06} {"step": 14810, "timestamp": 1778341718.7042372, "train/loss": 2.31775164604187, "train/z_loss": 0.0013661280507221818, "train/perplexity": 10.152821488708753, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1941749.5044701488, "perf/iters_per_sec": 0.9258983156538719, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0800322055816651, "data/tokens_consumed": 31060918272, "data/tokens_consumed_B": 31.060918272, "train/loss_slope": -9.172828026516039e-06} {"step": 14820, "timestamp": 1778341729.0978107, "train/loss": 2.3328169107437136, "train/z_loss": 0.0013689774903468787, "train/perplexity": 10.306934392948925, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019303.432671176, "perf/iters_per_sec": 0.9628789103847389, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.038552188873291, "data/tokens_consumed": 31081889792, "data/tokens_consumed_B": 31.081889792, "train/loss_slope": -9.268544595090617e-06} {"step": 14830, "timestamp": 1778341739.4531507, "train/loss": 2.3742597579956053, "train/z_loss": 0.0013588113710284234, "train/perplexity": 10.743057779352599, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.0451827645, "perf/iters_per_sec": 0.9662304140008757, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034949827194214, "data/tokens_consumed": 31102861312, "data/tokens_consumed_B": 31.102861312, "train/loss_slope": -6.357513474087588e-06} {"step": 14840, "timestamp": 1778341749.832514, "train/loss": 2.3702016115188598, "train/z_loss": 0.0013611430767923594, "train/perplexity": 10.69954921904145, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021679.3216128657, "perf/iters_per_sec": 0.9640118225159005, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373316764831544, "data/tokens_consumed": 31123832832, "data/tokens_consumed_B": 31.123832832, "train/loss_slope": -5.6351920058339e-06} {"step": 14850, "timestamp": 1778341760.181056, "grad/layer_0/attn": 0.003102246206253767, "grad/layer_0/mlp": 0.00341858621686697, "grad/layer_0/attn_mlp_ratio": 0.9074646414360193, "grad/layer_4/attn": 0.0021172550041228533, "grad/layer_4/mlp": 0.0025357131380587816, "grad/layer_4/attn_mlp_ratio": 0.8349741494206657, "grad/layer_8/attn": 0.00419236533343792, "grad/layer_8/mlp": 0.003421907080337405, "grad/layer_8/attn_mlp_ratio": 1.2251546031194644, "grad/layer_12/attn": 0.009929650463163853, "grad/layer_12/mlp": 0.006893749348819256, "grad/layer_12/attn_mlp_ratio": 1.4403845885151192, "grad/layer_16/attn": 0.005719670560210943, "grad/layer_16/mlp": 0.004373772069811821, "grad/layer_16/attn_mlp_ratio": 1.307720278547803, "grad/layer_20/attn": 0.0035584717988967896, "grad/layer_20/mlp": 0.005943778436630964, "grad/layer_20/attn_mlp_ratio": 0.5986884903208041, "grad/layer_24/attn": 0.010066584683954716, "grad/layer_24/mlp": 0.011082284152507782, "grad/layer_24/attn_mlp_ratio": 0.9083492585634387, "grad/layer_27/attn": 0.004863799549639225, "grad/layer_27/mlp": 0.01178759429603815, "grad/layer_27/attn_mlp_ratio": 0.41262019935755206} {"step": 14850, "timestamp": 1778341760.7868338, "eos/sharpness": 53.91933917999267, "eos/L0_probe": 2.3062877655029297, "eos/L_plus": 2.5751893520355225, "eos/L_minus": 2.5765795707702637, "eos/grad_norm": 0.17349913716316223, "eos/embed_grad_frac": 0.09108483791351318, "eos/time_s": 0.6030375957489014} {"step": 14850, "timestamp": 1778341760.807169, "train/loss": 2.3680830001831055, "train/z_loss": 0.001361248386092484, "train/perplexity": 10.676905028367948, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912111.706620995, "perf/iters_per_sec": 0.9117659123520827, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0967727422714233, "data/tokens_consumed": 31144804352, "data/tokens_consumed_B": 31.144804352, "train/loss_slope": -2.163760997567354e-06} {"step": 14850, "timestamp": 1778341762.1723108, "geo/rankme_last": 430.76849365234375, "geo/layer_0/stable_rank_q_proj": 20.657014846801758, "geo/layer_0/stable_rank_k_proj": 16.77320098876953, "geo/layer_0/stable_rank_o_proj": 43.8417854309082, "geo/layer_0/stable_rank_gate_proj": 124.273193359375, "geo/layer_0/stable_rank_down_proj": 57.8713493347168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06725160032510757, "geo/layer_0/attn_entropy_mean": 6.225549697875977, "geo/layer_0/attn_entropy_std": 0.459723562002182, "geo/layer_7/stable_rank_q_proj": 41.681793212890625, "geo/layer_7/stable_rank_k_proj": 38.56428527832031, "geo/layer_7/stable_rank_o_proj": 87.85960388183594, "geo/layer_7/stable_rank_gate_proj": 77.5231704711914, "geo/layer_7/stable_rank_down_proj": 143.9772491455078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38397496938705444, "geo/layer_7/attn_entropy_mean": 4.772353649139404, "geo/layer_7/attn_entropy_std": 0.7554721236228943, "geo/layer_14/stable_rank_q_proj": 51.593196868896484, "geo/layer_14/stable_rank_k_proj": 44.09497833251953, "geo/layer_14/stable_rank_o_proj": 42.30815887451172, "geo/layer_14/stable_rank_gate_proj": 71.9011001586914, "geo/layer_14/stable_rank_down_proj": 127.20986938476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36263811588287354, "geo/layer_14/attn_entropy_mean": 5.510867118835449, "geo/layer_14/attn_entropy_std": 0.4909435510635376, "geo/layer_21/stable_rank_q_proj": 38.24833297729492, "geo/layer_21/stable_rank_k_proj": 28.49424934387207, "geo/layer_21/stable_rank_o_proj": 64.64058685302734, "geo/layer_21/stable_rank_gate_proj": 59.77659225463867, "geo/layer_21/stable_rank_down_proj": 48.65388488769531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1333167999982834, "geo/layer_21/attn_entropy_mean": 5.8479084968566895, "geo/layer_21/attn_entropy_std": 0.33180880546569824, "geo/layer_27/stable_rank_q_proj": 45.25611877441406, "geo/layer_27/stable_rank_k_proj": 30.51981544494629, "geo/layer_27/stable_rank_o_proj": 106.47622680664062, "geo/layer_27/stable_rank_gate_proj": 69.10481262207031, "geo/layer_27/stable_rank_down_proj": 129.95449829101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10005923360586166, "geo/layer_27/attn_entropy_mean": 4.295191764831543, "geo/layer_27/attn_entropy_std": 0.7081498503684998, "attnres/final_alpha/block_0": 0.26455259323120117, "attnres/block_norm/0": 1.783726453781128, "attnres/final_alpha/block_1": 0.0037831792142242193, "attnres/block_norm/1": 50830.640625, "attnres/final_alpha/block_2": 0.00829320214688778, "attnres/block_norm/2": 30077.49609375, "attnres/final_alpha/block_3": 0.010601917281746864, "attnres/block_norm/3": 73901.6796875, "attnres/final_alpha/block_4": 0.011888202279806137, "attnres/block_norm/4": 17828.0390625, "attnres/final_alpha/block_5": 0.6012077331542969, "attnres/block_norm/5": 7239.6787109375, "attnres/final_alpha/block_6": 0.09967318177223206, "attnres/block_norm/6": 49460.68359375, "geo/tier1_time_s": 1.3608269691467285, "geo/step": 14850.0, "geo/rankme_slope": 5.8724935286614645e-05} {"step": 14860, "timestamp": 1778341772.523588, "train/loss": 2.3678963661193846, "train/z_loss": 0.0013780749519355594, "train/perplexity": 10.674912540133413, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790563.5291690931, "perf/iters_per_sec": 0.8538072248311487, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1712245702743531, "data/tokens_consumed": 31165775872, "data/tokens_consumed_B": 31.165775872, "train/loss_slope": -7.853818304575492e-07} {"step": 14870, "timestamp": 1778341783.397049, "train/loss": 2.3018311500549316, "train/z_loss": 0.0013699701870791615, "train/perplexity": 9.992463412044497, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930099.0769782779, "perf/iters_per_sec": 0.9203429589167966, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0865514755249024, "data/tokens_consumed": 31186747392, "data/tokens_consumed_B": 31.186747392, "train/loss_slope": 1.5452785818130174e-07} {"step": 14880, "timestamp": 1778341793.7508826, "train/loss": 2.3531080722808837, "train/z_loss": 0.0013728399761021137, "train/perplexity": 10.518210331745232, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026938.4605565197, "perf/iters_per_sec": 0.9665195753843878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346401929855347, "data/tokens_consumed": 31207718912, "data/tokens_consumed_B": 31.207718912, "train/loss_slope": -1.1895619388675564e-06} {"step": 14890, "timestamp": 1778341804.1100364, "train/loss": 2.3189621925354005, "train/z_loss": 0.0013739592861384154, "train/perplexity": 10.165119393252159, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025513.9807592945, "perf/iters_per_sec": 0.9658403304859612, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353678226470948, "data/tokens_consumed": 31228690432, "data/tokens_consumed_B": 31.228690432, "train/loss_slope": -6.648126036206152e-07} {"step": 14900, "timestamp": 1778341814.473745, "grad/layer_0/attn": 0.0031133766751736403, "grad/layer_0/mlp": 0.003660105401650071, "grad/layer_0/attn_mlp_ratio": 0.8506248450406823, "grad/layer_4/attn": 0.001844000886194408, "grad/layer_4/mlp": 0.002644765190780163, "grad/layer_4/attn_mlp_ratio": 0.6972266660571825, "grad/layer_8/attn": 0.0038673109374940395, "grad/layer_8/mlp": 0.0035299991723150015, "grad/layer_8/attn_mlp_ratio": 1.0955557322134593, "grad/layer_12/attn": 0.005176295526325703, "grad/layer_12/mlp": 0.006933184806257486, "grad/layer_12/attn_mlp_ratio": 0.7465970684921271, "grad/layer_16/attn": 0.003503351239487529, "grad/layer_16/mlp": 0.004439251031726599, "grad/layer_16/attn_mlp_ratio": 0.7891761775876236, "grad/layer_20/attn": 0.004956298507750034, "grad/layer_20/mlp": 0.006853068247437477, "grad/layer_20/attn_mlp_ratio": 0.7232232711648514, "grad/layer_24/attn": 0.012905933894217014, "grad/layer_24/mlp": 0.01330550480633974, "grad/layer_24/attn_mlp_ratio": 0.9699694964651555, "grad/layer_27/attn": 0.004906133748590946, "grad/layer_27/mlp": 0.014602979645133018, "grad/layer_27/attn_mlp_ratio": 0.3359679896992322} {"step": 14900, "timestamp": 1778341814.489687, "train/loss": 2.3451367616653442, "train/z_loss": 0.0013722338248044253, "train/perplexity": 10.434699696877253, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021589.0423236885, "perf/iters_per_sec": 0.9639687739962046, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373780012130738, "data/tokens_consumed": 31249661952, "data/tokens_consumed_B": 31.249661952, "train/loss_slope": -3.694559481280175e-08} {"step": 14910, "timestamp": 1778341824.8486826, "train/loss": 2.2957610845565797, "train/z_loss": 0.0013547747628763318, "train/perplexity": 9.931992222359716, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025556.0530574012, "perf/iters_per_sec": 0.9658603921210295, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353463172912598, "data/tokens_consumed": 31270633472, "data/tokens_consumed_B": 31.270633472, "train/loss_slope": -4.8033067352452e-07} {"step": 14920, "timestamp": 1778341835.2048807, "train/loss": 2.325610876083374, "train/z_loss": 0.00136997353984043, "train/perplexity": 10.232929228599335, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026045.6101457283, "perf/iters_per_sec": 0.9660938311318056, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350961446762086, "data/tokens_consumed": 31291604992, "data/tokens_consumed_B": 31.291604992, "train/loss_slope": 1.8949949451655833e-06} {"step": 14925, "timestamp": 1778341840.976646, "eos/sharpness": 57.75399208068846, "eos/L0_probe": 2.303102731704712, "eos/L_plus": 2.5715770721435547, "eos/L_minus": 2.612168312072754, "eos/grad_norm": 0.16418005526065826, "eos/embed_grad_frac": 0.09364407509565353, "eos/time_s": 0.5991060733795166} {"step": 14925, "timestamp": 1778341842.3557584, "geo/rankme_last": 429.49420166015625, "geo/layer_0/stable_rank_q_proj": 20.638063430786133, "geo/layer_0/stable_rank_k_proj": 16.76502227783203, "geo/layer_0/stable_rank_o_proj": 43.81193161010742, "geo/layer_0/stable_rank_gate_proj": 124.21052551269531, "geo/layer_0/stable_rank_down_proj": 57.82463073730469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06349791586399078, "geo/layer_0/attn_entropy_mean": 6.22901725769043, "geo/layer_0/attn_entropy_std": 0.45831814408302307, "geo/layer_7/stable_rank_q_proj": 41.66307067871094, "geo/layer_7/stable_rank_k_proj": 38.45896530151367, "geo/layer_7/stable_rank_o_proj": 87.89091491699219, "geo/layer_7/stable_rank_gate_proj": 77.58802032470703, "geo/layer_7/stable_rank_down_proj": 144.16726684570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3994653820991516, "geo/layer_7/attn_entropy_mean": 4.7127299308776855, "geo/layer_7/attn_entropy_std": 0.7610653638839722, "geo/layer_14/stable_rank_q_proj": 51.55260467529297, "geo/layer_14/stable_rank_k_proj": 44.062564849853516, "geo/layer_14/stable_rank_o_proj": 42.296199798583984, "geo/layer_14/stable_rank_gate_proj": 71.8546371459961, "geo/layer_14/stable_rank_down_proj": 127.17208862304688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3626616299152374, "geo/layer_14/attn_entropy_mean": 5.535000801086426, "geo/layer_14/attn_entropy_std": 0.4775369465351105, "geo/layer_21/stable_rank_q_proj": 38.203800201416016, "geo/layer_21/stable_rank_k_proj": 28.485004425048828, "geo/layer_21/stable_rank_o_proj": 64.56503295898438, "geo/layer_21/stable_rank_gate_proj": 59.81558609008789, "geo/layer_21/stable_rank_down_proj": 48.6286735534668, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.131562277674675, "geo/layer_21/attn_entropy_mean": 5.843236923217773, "geo/layer_21/attn_entropy_std": 0.3317597806453705, "geo/layer_27/stable_rank_q_proj": 45.19358825683594, "geo/layer_27/stable_rank_k_proj": 30.509178161621094, "geo/layer_27/stable_rank_o_proj": 106.50019073486328, "geo/layer_27/stable_rank_gate_proj": 69.15845489501953, "geo/layer_27/stable_rank_down_proj": 129.83226013183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09833342581987381, "geo/layer_27/attn_entropy_mean": 4.308250427246094, "geo/layer_27/attn_entropy_std": 0.6958962082862854, "attnres/final_alpha/block_0": 0.26477864384651184, "attnres/block_norm/0": 1.7838246822357178, "attnres/final_alpha/block_1": 0.0037333073560148478, "attnres/block_norm/1": 50720.2578125, "attnres/final_alpha/block_2": 0.008060035295784473, "attnres/block_norm/2": 30103.8515625, "attnres/final_alpha/block_3": 0.01040972676128149, "attnres/block_norm/3": 74073.5859375, "attnres/final_alpha/block_4": 0.011895956471562386, "attnres/block_norm/4": 17744.42578125, "attnres/final_alpha/block_5": 0.6022264361381531, "attnres/block_norm/5": 7270.76318359375, "attnres/final_alpha/block_6": 0.09889590740203857, "attnres/block_norm/6": 49548.07421875, "geo/tier1_time_s": 1.3593909740447998, "geo/step": 14925.0, "geo/rankme_slope": 4.997332917542016e-05} {"step": 14930, "timestamp": 1778341847.5536213, "train/loss": 2.34423291683197, "train/z_loss": 0.001366386201698333, "train/perplexity": 10.425272608422794, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699309.2856452481, "perf/iters_per_sec": 0.8102938106752625, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.23412024974823, "data/tokens_consumed": 31312576512, "data/tokens_consumed_B": 31.312576512, "train/loss_slope": 3.207814196298466e-06} {"step": 14940, "timestamp": 1778341857.902356, "train/loss": 2.349855399131775, "train/z_loss": 0.0013722999952733516, "train/perplexity": 10.484053611859805, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027435.8359895512, "perf/iters_per_sec": 0.9667567424724346, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343863725662232, "data/tokens_consumed": 31333548032, "data/tokens_consumed_B": 31.333548032, "train/loss_slope": 4.228084580232301e-06} {"step": 14950, "timestamp": 1778341868.2393095, "grad/layer_0/attn": 0.0028890823014080524, "grad/layer_0/mlp": 0.0033880374394357204, "grad/layer_0/attn_mlp_ratio": 0.8527303100334688, "grad/layer_4/attn": 0.00254677701741457, "grad/layer_4/mlp": 0.0026429533027112484, "grad/layer_4/attn_mlp_ratio": 0.9636102607037956, "grad/layer_8/attn": 0.008952470496296883, "grad/layer_8/mlp": 0.003501635044813156, "grad/layer_8/attn_mlp_ratio": 2.5566542846583684, "grad/layer_12/attn": 0.006438417825847864, "grad/layer_12/mlp": 0.007702680304646492, "grad/layer_12/attn_mlp_ratio": 0.8358671900711361, "grad/layer_16/attn": 0.003940236289054155, "grad/layer_16/mlp": 0.004589108284562826, "grad/layer_16/attn_mlp_ratio": 0.8586060643737696, "grad/layer_20/attn": 0.002628571819514036, "grad/layer_20/mlp": 0.006375672295689583, "grad/layer_20/attn_mlp_ratio": 0.4122815063852938, "grad/layer_24/attn": 0.010455765761435032, "grad/layer_24/mlp": 0.011287259869277477, "grad/layer_24/attn_mlp_ratio": 0.9263333873672026, "grad/layer_27/attn": 0.0062165954150259495, "grad/layer_27/mlp": 0.012275069952011108, "grad/layer_27/attn_mlp_ratio": 0.5064407281331517} {"step": 14950, "timestamp": 1778341868.2551498, "train/loss": 2.350669503211975, "train/z_loss": 0.0013665943290106953, "train/perplexity": 10.492592197859517, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026664.3683472842, "perf/iters_per_sec": 0.9663888780342503, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347801208496095, "data/tokens_consumed": 31354519552, "data/tokens_consumed_B": 31.354519552, "train/loss_slope": 5.607972315328451e-06} {"step": 14960, "timestamp": 1778341878.6035779, "train/loss": 2.2857128858566282, "train/z_loss": 0.001372830884065479, "train/perplexity": 9.832693314089086, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027986.896661373, "perf/iters_per_sec": 0.9670195086771837, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341053009033203, "data/tokens_consumed": 31375491072, "data/tokens_consumed_B": 31.375491072, "train/loss_slope": 4.615998260974164e-06} {"step": 14970, "timestamp": 1778341888.9645622, "train/loss": 2.328592538833618, "train/z_loss": 0.001382807968184352, "train/perplexity": 10.263485904719417, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024996.0563430085, "perf/iters_per_sec": 0.9655933648791354, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356326341629027, "data/tokens_consumed": 31396462592, "data/tokens_consumed_B": 31.396462592, "train/loss_slope": 7.234405305268975e-06} {"step": 14980, "timestamp": 1778341899.3225183, "train/loss": 2.3496024131774904, "train/z_loss": 0.0013757244218140841, "train/perplexity": 10.481401629023386, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025624.6691688562, "perf/iters_per_sec": 0.9658931108326226, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353112459182738, "data/tokens_consumed": 31417434112, "data/tokens_consumed_B": 31.417434112, "train/loss_slope": 7.26714969718559e-06} {"step": 14990, "timestamp": 1778341909.6798072, "train/loss": 2.340031957626343, "train/z_loss": 0.0013576839701272547, "train/perplexity": 10.381568327712076, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026435.820271972, "perf/iters_per_sec": 0.9662798978195057, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348968267440797, "data/tokens_consumed": 31438405632, "data/tokens_consumed_B": 31.438405632, "train/loss_slope": 9.576168569615755e-06} {"step": 15000, "timestamp": 1778341920.029299, "grad/layer_0/attn": 0.0026768094394356012, "grad/layer_0/mlp": 0.003173199715092778, "grad/layer_0/attn_mlp_ratio": 0.8435678795592444, "grad/layer_4/attn": 0.0020251963287591934, "grad/layer_4/mlp": 0.0025918232277035713, "grad/layer_4/attn_mlp_ratio": 0.781379003388157, "grad/layer_8/attn": 0.004104153253138065, "grad/layer_8/mlp": 0.00344491726718843, "grad/layer_8/attn_mlp_ratio": 1.1913647892482457, "grad/layer_12/attn": 0.005939126946032047, "grad/layer_12/mlp": 0.007015022449195385, "grad/layer_12/attn_mlp_ratio": 0.8466297726602829, "grad/layer_16/attn": 0.006554707419127226, "grad/layer_16/mlp": 0.0053474451415240765, "grad/layer_16/attn_mlp_ratio": 1.2257642898759387, "grad/layer_20/attn": 0.003531056921929121, "grad/layer_20/mlp": 0.0066377329640090466, "grad/layer_20/attn_mlp_ratio": 0.5319672978528063, "grad/layer_24/attn": 0.013004635460674763, "grad/layer_24/mlp": 0.011106234043836594, "grad/layer_24/attn_mlp_ratio": 1.1709311448193884, "grad/layer_27/attn": 0.004663575440645218, "grad/layer_27/mlp": 0.013012262992560863, "grad/layer_27/attn_mlp_ratio": 0.35839848975320776} {"step": 15000, "timestamp": 1778341920.639202, "eos/sharpness": 57.86950588226317, "eos/L0_probe": 2.304286479949951, "eos/L_plus": 2.5887436866760254, "eos/L_minus": 2.598524332046509, "eos/grad_norm": 0.19649402797222137, "eos/embed_grad_frac": 0.05867790803313255, "eos/time_s": 0.6072454452514648} {"step": 15000, "timestamp": 1778341920.6584475, "train/loss": 2.344960618019104, "train/z_loss": 0.0013677514041773974, "train/perplexity": 10.43286185269226, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911033.7638700292, "perf/iters_per_sec": 0.9112519091940066, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.097391390800476, "data/tokens_consumed": 31459377152, "data/tokens_consumed_B": 31.459377152, "train/loss_slope": 1.0975399569566627e-05} {"step": 15000, "timestamp": 1778341922.028537, "geo/rankme_last": 430.0276184082031, "geo/layer_0/stable_rank_q_proj": 20.652587890625, "geo/layer_0/stable_rank_k_proj": 16.781673431396484, "geo/layer_0/stable_rank_o_proj": 43.728675842285156, "geo/layer_0/stable_rank_gate_proj": 123.98949432373047, "geo/layer_0/stable_rank_down_proj": 57.89474105834961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0663251206278801, "geo/layer_0/attn_entropy_mean": 6.231705665588379, "geo/layer_0/attn_entropy_std": 0.4585259258747101, "geo/layer_7/stable_rank_q_proj": 41.60981750488281, "geo/layer_7/stable_rank_k_proj": 38.50068283081055, "geo/layer_7/stable_rank_o_proj": 87.97459411621094, "geo/layer_7/stable_rank_gate_proj": 77.61648559570312, "geo/layer_7/stable_rank_down_proj": 144.08993530273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40090954303741455, "geo/layer_7/attn_entropy_mean": 4.736868381500244, "geo/layer_7/attn_entropy_std": 0.7542335391044617, "geo/layer_14/stable_rank_q_proj": 51.61418151855469, "geo/layer_14/stable_rank_k_proj": 44.18110275268555, "geo/layer_14/stable_rank_o_proj": 42.28334045410156, "geo/layer_14/stable_rank_gate_proj": 71.81270599365234, "geo/layer_14/stable_rank_down_proj": 126.72989654541016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3604724407196045, "geo/layer_14/attn_entropy_mean": 5.489961624145508, "geo/layer_14/attn_entropy_std": 0.491591215133667, "geo/layer_21/stable_rank_q_proj": 38.22842025756836, "geo/layer_21/stable_rank_k_proj": 28.502527236938477, "geo/layer_21/stable_rank_o_proj": 64.49803161621094, "geo/layer_21/stable_rank_gate_proj": 59.72517395019531, "geo/layer_21/stable_rank_down_proj": 48.626590728759766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13753584027290344, "geo/layer_21/attn_entropy_mean": 5.853405952453613, "geo/layer_21/attn_entropy_std": 0.32859352231025696, "geo/layer_27/stable_rank_q_proj": 45.235740661621094, "geo/layer_27/stable_rank_k_proj": 30.532682418823242, "geo/layer_27/stable_rank_o_proj": 106.67009735107422, "geo/layer_27/stable_rank_gate_proj": 69.15162658691406, "geo/layer_27/stable_rank_down_proj": 129.8152313232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09124476462602615, "geo/layer_27/attn_entropy_mean": 4.324914932250977, "geo/layer_27/attn_entropy_std": 0.7018023729324341, "attnres/final_alpha/block_0": 0.2650134563446045, "attnres/block_norm/0": 1.7840571403503418, "attnres/final_alpha/block_1": 0.0037465195637196302, "attnres/block_norm/1": 50845.2734375, "attnres/final_alpha/block_2": 0.008219699375331402, "attnres/block_norm/2": 30007.849609375, "attnres/final_alpha/block_3": 0.010473478585481644, "attnres/block_norm/3": 73565.515625, "attnres/final_alpha/block_4": 0.011971558444201946, "attnres/block_norm/4": 17785.533203125, "attnres/final_alpha/block_5": 0.6002405881881714, "attnres/block_norm/5": 7333.662109375, "attnres/final_alpha/block_6": 0.10033471882343292, "attnres/block_norm/6": 49562.30078125, "geo/tier1_time_s": 1.365859031677246, "geo/step": 15000.0, "geo/rankme_slope": 6.677129836309523e-05} {"step": 15000, "timestamp": 1778341929.0943484, "geo/ww_alpha_mean": 7.859654967637468, "geo/ww_alpha_std": 5.27938160505043, "geo/ww_alpha_min": 1.331931552505269, "geo/ww_alpha_max": 45.16429854125567, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.913304898943621, "geo/ww_alpha_by_type/k_proj": 4.4588224905065115, "geo/ww_alpha_by_type/v_proj": 9.456006737173777, "geo/ww_alpha_by_type/o_proj": 9.27952292526875, "geo/ww_alpha_by_type/gate_proj": 7.840617094738207, "geo/ww_alpha_by_type/up_proj": 12.056679400525892, "geo/ww_alpha_by_type/down_proj": 8.116921530610343, "geo/twonn_id/layer_0": 0.6899185180664062, "geo/twonn_id/layer_7": 3.7160491943359375, "geo/twonn_id/layer_14": 5.232442855834961, "geo/twonn_id/layer_21": 7.763797283172607, "geo/twonn_id/layer_27": 6.291449069976807, "geo/tier2_time_s": 7.059159278869629} {"step": 15000, "timestamp": 1778341929.8806877, "eoc/jacobian_sigma/layer_0/attn": 1556.0450439453125, "eoc/jacobian_sigma/layer_0/mlp": 10697.0947265625, "eoc/jacobian_sigma/layer_0": 10697.0947265625, "eoc/jacobian_sigma/layer_7/attn": 1.141912817955017, "eoc/jacobian_sigma/layer_7/mlp": 1.8406058549880981, "eoc/jacobian_sigma/layer_7": 1.8406058549880981, "eoc/jacobian_sigma/layer_14/attn": 2.075983762741089, "eoc/jacobian_sigma/layer_14/mlp": 14.553672790527344, "eoc/jacobian_sigma/layer_14": 14.553672790527344, "eoc/jacobian_sigma/layer_21/attn": 1.0926308631896973, "eoc/jacobian_sigma/layer_21/mlp": 5.839081764221191, "eoc/jacobian_sigma/layer_21": 5.839081764221191, "eoc/jacobian_sigma/layer_27/attn": 4.03477668762207, "eoc/jacobian_sigma/layer_27/mlp": 44.72996520996094, "eoc/jacobian_sigma/layer_27": 44.72996520996094, "eoc/layer0_sigma": 10697.0947265625, "eoc/sigma_max": 44.72996520996094, "eoc/sigma_min": 1.8406058549880981, "eoc/sigma_mean": 16.740831404924393, "eoc/time_s": 0.7776696681976318} {"step": 15010, "timestamp": 1778341940.2543333, "train/loss": 2.330435800552368, "train/z_loss": 0.001365089404862374, "train/perplexity": 10.282421641786781, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1070415.8777513297, "perf/iters_per_sec": 0.5104140652424477, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9591936588287353, "data/tokens_consumed": 31480348672, "data/tokens_consumed_B": 31.480348672, "train/loss_slope": 1.0399244828085387e-05} {"step": 15020, "timestamp": 1778341950.6095858, "train/loss": 2.2782490730285643, "train/z_loss": 0.0013686748570762575, "train/perplexity": 9.759577133771689, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026285.3661974485, "perf/iters_per_sec": 0.9662081557261698, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034973669052124, "data/tokens_consumed": 31501320192, "data/tokens_consumed_B": 31.501320192, "train/loss_slope": 7.059230999012338e-06} {"step": 15030, "timestamp": 1778341960.9672904, "train/loss": 2.323444652557373, "train/z_loss": 0.001373136939946562, "train/perplexity": 10.210786408372302, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025686.0124562252, "perf/iters_per_sec": 0.9659223615914465, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035279893875122, "data/tokens_consumed": 31522291712, "data/tokens_consumed_B": 31.522291712, "train/loss_slope": 3.9494344932768505e-06} {"step": 15040, "timestamp": 1778341971.3200734, "train/loss": 2.311211371421814, "train/z_loss": 0.0013668196392245592, "train/perplexity": 10.086635919827241, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027082.8919972393, "perf/iters_per_sec": 0.9665884456621358, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345664739608764, "data/tokens_consumed": 31543263232, "data/tokens_consumed_B": 31.543263232, "train/loss_slope": 1.2478361321467763e-06} {"step": 15050, "timestamp": 1778341981.6839767, "grad/layer_0/attn": 0.0029246811755001545, "grad/layer_0/mlp": 0.0031902252230793238, "grad/layer_0/attn_mlp_ratio": 0.9167631998723976, "grad/layer_4/attn": 0.0023837604094296694, "grad/layer_4/mlp": 0.0025814236141741276, "grad/layer_4/attn_mlp_ratio": 0.9234285701881776, "grad/layer_8/attn": 0.0029762755148112774, "grad/layer_8/mlp": 0.003319408278912306, "grad/layer_8/attn_mlp_ratio": 0.8966283069353868, "grad/layer_12/attn": 0.005292064044624567, "grad/layer_12/mlp": 0.007400279864668846, "grad/layer_12/attn_mlp_ratio": 0.7151167347573966, "grad/layer_16/attn": 0.0037267659790813923, "grad/layer_16/mlp": 0.0046147871762514114, "grad/layer_16/attn_mlp_ratio": 0.8075704807153412, "grad/layer_20/attn": 0.004498188383877277, "grad/layer_20/mlp": 0.005745778325945139, "grad/layer_20/attn_mlp_ratio": 0.7828684036205866, "grad/layer_24/attn": 0.009322690777480602, "grad/layer_24/mlp": 0.010102330707013607, "grad/layer_24/attn_mlp_ratio": 0.9228257276041947, "grad/layer_27/attn": 0.0046085030771791935, "grad/layer_27/mlp": 0.009763206355273724, "grad/layer_27/attn_mlp_ratio": 0.47202761698128903} {"step": 15050, "timestamp": 1778341981.6999464, "train/loss": 2.3084412813186646, "train/z_loss": 0.001369636319577694, "train/perplexity": 10.058733693174686, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021760.8726265277, "perf/iters_per_sec": 0.9640507090695036, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037289834022522, "data/tokens_consumed": 31564234752, "data/tokens_consumed_B": 31.564234752, "train/loss_slope": 3.748790158403632e-07} {"step": 15060, "timestamp": 1778341992.0641174, "train/loss": 2.2952563047409056, "train/z_loss": 0.0013649244094267488, "train/perplexity": 9.926980018292586, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024462.927405037, "perf/iters_per_sec": 0.9653391491913972, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035905361175537, "data/tokens_consumed": 31585206272, "data/tokens_consumed_B": 31.585206272, "train/loss_slope": -2.872321128559148e-06} {"step": 15070, "timestamp": 1778342002.4174404, "train/loss": 2.280926775932312, "train/z_loss": 0.0013775951811112463, "train/perplexity": 9.785745401589761, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026987.8321198036, "perf/iters_per_sec": 0.9665431175803202, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346149921417236, "data/tokens_consumed": 31606177792, "data/tokens_consumed_B": 31.606177792, "train/loss_slope": -3.7220882933096694e-06} {"step": 15075, "timestamp": 1778342008.1908724, "eos/sharpness": 50.020098686218255, "eos/L0_probe": 2.3056793212890625, "eos/L_plus": 2.5368704795837402, "eos/L_minus": 2.5746891498565674, "eos/grad_norm": 0.1350032091140747, "eos/embed_grad_frac": 0.11215472221374512, "eos/time_s": 0.6067755222320557} {"step": 15075, "timestamp": 1778342009.5649471, "geo/rankme_last": 429.5595397949219, "geo/layer_0/stable_rank_q_proj": 20.663583755493164, "geo/layer_0/stable_rank_k_proj": 16.805082321166992, "geo/layer_0/stable_rank_o_proj": 43.738101959228516, "geo/layer_0/stable_rank_gate_proj": 123.71051788330078, "geo/layer_0/stable_rank_down_proj": 57.83842468261719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06598053872585297, "geo/layer_0/attn_entropy_mean": 6.233376502990723, "geo/layer_0/attn_entropy_std": 0.4591582417488098, "geo/layer_7/stable_rank_q_proj": 41.60514831542969, "geo/layer_7/stable_rank_k_proj": 38.395103454589844, "geo/layer_7/stable_rank_o_proj": 87.64458465576172, "geo/layer_7/stable_rank_gate_proj": 77.50938415527344, "geo/layer_7/stable_rank_down_proj": 143.8758544921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40180703997612, "geo/layer_7/attn_entropy_mean": 4.756279945373535, "geo/layer_7/attn_entropy_std": 0.7810009717941284, "geo/layer_14/stable_rank_q_proj": 51.6132698059082, "geo/layer_14/stable_rank_k_proj": 44.29401779174805, "geo/layer_14/stable_rank_o_proj": 42.23377990722656, "geo/layer_14/stable_rank_gate_proj": 71.95672607421875, "geo/layer_14/stable_rank_down_proj": 126.67761993408203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3756929337978363, "geo/layer_14/attn_entropy_mean": 5.538277626037598, "geo/layer_14/attn_entropy_std": 0.5013554692268372, "geo/layer_21/stable_rank_q_proj": 38.261898040771484, "geo/layer_21/stable_rank_k_proj": 28.50517463684082, "geo/layer_21/stable_rank_o_proj": 64.57283782958984, "geo/layer_21/stable_rank_gate_proj": 59.6441535949707, "geo/layer_21/stable_rank_down_proj": 48.60424041748047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1343916803598404, "geo/layer_21/attn_entropy_mean": 5.876102447509766, "geo/layer_21/attn_entropy_std": 0.33526667952537537, "geo/layer_27/stable_rank_q_proj": 45.287784576416016, "geo/layer_27/stable_rank_k_proj": 30.495649337768555, "geo/layer_27/stable_rank_o_proj": 106.86528015136719, "geo/layer_27/stable_rank_gate_proj": 69.11306762695312, "geo/layer_27/stable_rank_down_proj": 129.79959106445312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11235851794481277, "geo/layer_27/attn_entropy_mean": 4.344947338104248, "geo/layer_27/attn_entropy_std": 0.7029030323028564, "attnres/final_alpha/block_0": 0.26527491211891174, "attnres/block_norm/0": 1.7841522693634033, "attnres/final_alpha/block_1": 0.003740868531167507, "attnres/block_norm/1": 50760.7734375, "attnres/final_alpha/block_2": 0.008097055368125439, "attnres/block_norm/2": 29962.44140625, "attnres/final_alpha/block_3": 0.010345269925892353, "attnres/block_norm/3": 73554.8984375, "attnres/final_alpha/block_4": 0.01199403777718544, "attnres/block_norm/4": 17800.17578125, "attnres/final_alpha/block_5": 0.6001914739608765, "attnres/block_norm/5": 7289.25390625, "attnres/final_alpha/block_6": 0.10035637021064758, "attnres/block_norm/6": 49587.5, "geo/tier1_time_s": 1.3547725677490234, "geo/step": 15075.0, "geo/rankme_slope": 7.97535225027511e-05} {"step": 15080, "timestamp": 1778342014.7407026, "train/loss": 2.3010827779769896, "train/z_loss": 0.0013657979434356093, "train/perplexity": 9.984988128932528, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702694.2959527427, "perf/iters_per_sec": 0.8119079093707765, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2316667795181275, "data/tokens_consumed": 31627149312, "data/tokens_consumed_B": 31.627149312, "train/loss_slope": -3.6373957191328164e-06} {"step": 15090, "timestamp": 1778342025.095555, "train/loss": 2.334352970123291, "train/z_loss": 0.0013696507667191327, "train/perplexity": 10.322778621920822, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026969.241598905, "perf/iters_per_sec": 0.9665342529291654, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346244812011718, "data/tokens_consumed": 31648120832, "data/tokens_consumed_B": 31.648120832, "train/loss_slope": -4.523148280595022e-06} {"step": 15100, "timestamp": 1778342035.4448771, "grad/layer_0/attn": 0.0034429861698299646, "grad/layer_0/mlp": 0.0037417523562908173, "grad/layer_0/attn_mlp_ratio": 0.920153380013541, "grad/layer_4/attn": 0.0025642390828579664, "grad/layer_4/mlp": 0.002657833509147167, "grad/layer_4/attn_mlp_ratio": 0.9647854079476281, "grad/layer_8/attn": 0.0045345621183514595, "grad/layer_8/mlp": 0.003475121920928359, "grad/layer_8/attn_mlp_ratio": 1.3048641431992298, "grad/layer_12/attn": 0.005401041358709335, "grad/layer_12/mlp": 0.006984483916312456, "grad/layer_12/attn_mlp_ratio": 0.7732913907591532, "grad/layer_16/attn": 0.0037426436319947243, "grad/layer_16/mlp": 0.005006854888051748, "grad/layer_16/attn_mlp_ratio": 0.7475039003378546, "grad/layer_20/attn": 0.004719522316008806, "grad/layer_20/mlp": 0.0067428601905703545, "grad/layer_20/attn_mlp_ratio": 0.6999288302931157, "grad/layer_24/attn": 0.013815206475555897, "grad/layer_24/mlp": 0.013387767598032951, "grad/layer_24/attn_mlp_ratio": 1.0319275615744177, "grad/layer_27/attn": 0.005901777651160955, "grad/layer_27/mlp": 0.013812539167702198, "grad/layer_27/attn_mlp_ratio": 0.42727680528380885} {"step": 15100, "timestamp": 1778342035.4605238, "train/loss": 2.3022232055664062, "train/z_loss": 0.0013663850375451147, "train/perplexity": 9.996381780457174, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024596.474571259, "perf/iters_per_sec": 0.9654028294426246, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358370304107667, "data/tokens_consumed": 31669092352, "data/tokens_consumed_B": 31.669092352, "train/loss_slope": -6.099895239234063e-06} {"step": 15110, "timestamp": 1778342045.809069, "train/loss": 2.3717281103134153, "train/z_loss": 0.001363575633149594, "train/perplexity": 10.715894540409298, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027440.041781972, "perf/iters_per_sec": 0.9667587479505405, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343842267990113, "data/tokens_consumed": 31690063872, "data/tokens_consumed_B": 31.690063872, "train/loss_slope": -1.4168144881886962e-06} {"step": 15120, "timestamp": 1778342056.1619973, "train/loss": 2.3240733623504637, "train/z_loss": 0.0013742375653237105, "train/perplexity": 10.217208048244816, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027111.62192104, "perf/iters_per_sec": 0.9666021451573562, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345518112182617, "data/tokens_consumed": 31711035392, "data/tokens_consumed_B": 31.711035392, "train/loss_slope": -2.752033540375727e-07} {"step": 15130, "timestamp": 1778342066.5095963, "train/loss": 2.3487955570220946, "train/z_loss": 0.0013651571469381451, "train/perplexity": 10.472948056468978, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027794.9333160832, "perf/iters_per_sec": 0.9669279734211365, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342031955718993, "data/tokens_consumed": 31732006912, "data/tokens_consumed_B": 31.732006912, "train/loss_slope": -5.635039927257172e-07} {"step": 15140, "timestamp": 1778342076.8600607, "train/loss": 2.348214602470398, "train/z_loss": 0.0013619105215184391, "train/perplexity": 10.466865516636567, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026914.312805889, "perf/iters_per_sec": 0.9665080608396001, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346525192260743, "data/tokens_consumed": 31752978432, "data/tokens_consumed_B": 31.752978432, "train/loss_slope": 8.295642052760295e-07} {"step": 15150, "timestamp": 1778342087.2086742, "grad/layer_0/attn": 0.002886332804337144, "grad/layer_0/mlp": 0.0031124164815992117, "grad/layer_0/attn_mlp_ratio": 0.9273606950307718, "grad/layer_4/attn": 0.002125686267390847, "grad/layer_4/mlp": 0.0028118425980210304, "grad/layer_4/attn_mlp_ratio": 0.7559762389577842, "grad/layer_8/attn": 0.004525851458311081, "grad/layer_8/mlp": 0.003468089969828725, "grad/layer_8/attn_mlp_ratio": 1.3049982460618719, "grad/layer_12/attn": 0.00763136800378561, "grad/layer_12/mlp": 0.007487640716135502, "grad/layer_12/attn_mlp_ratio": 1.019195256714289, "grad/layer_16/attn": 0.004284538794308901, "grad/layer_16/mlp": 0.004990620072931051, "grad/layer_16/attn_mlp_ratio": 0.8585183095175404, "grad/layer_20/attn": 0.0027012100908905268, "grad/layer_20/mlp": 0.005605958867818117, "grad/layer_20/attn_mlp_ratio": 0.48184621157544055, "grad/layer_24/attn": 0.007176211569458246, "grad/layer_24/mlp": 0.008915924467146397, "grad/layer_24/attn_mlp_ratio": 0.8048757608270168, "grad/layer_27/attn": 0.007550816051661968, "grad/layer_27/mlp": 0.008238798007369041, "grad/layer_27/attn_mlp_ratio": 0.9164948519503445} {"step": 15150, "timestamp": 1778342087.8127477, "eos/sharpness": 56.51285648345946, "eos/L0_probe": 2.3077259063720703, "eos/L_plus": 2.651276111602783, "eos/L_minus": 2.529304265975952, "eos/grad_norm": 0.14552365243434906, "eos/embed_grad_frac": 0.10680298507213593, "eos/time_s": 0.6014387607574463} {"step": 15150, "timestamp": 1778342087.8337376, "train/loss": 2.323033094406128, "train/z_loss": 0.0013767576892860234, "train/perplexity": 10.206584940628767, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912315.5257264157, "perf/iters_per_sec": 0.911863100875099, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0966558456420898, "data/tokens_consumed": 31773949952, "data/tokens_consumed_B": 31.773949952, "train/loss_slope": 3.5705975144251164e-06} {"step": 15150, "timestamp": 1778342089.2002532, "geo/rankme_last": 430.0087585449219, "geo/layer_0/stable_rank_q_proj": 20.658130645751953, "geo/layer_0/stable_rank_k_proj": 16.83121109008789, "geo/layer_0/stable_rank_o_proj": 43.7656135559082, "geo/layer_0/stable_rank_gate_proj": 123.82469177246094, "geo/layer_0/stable_rank_down_proj": 57.86960983276367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06451001018285751, "geo/layer_0/attn_entropy_mean": 6.2354936599731445, "geo/layer_0/attn_entropy_std": 0.46252432465553284, "geo/layer_7/stable_rank_q_proj": 41.616737365722656, "geo/layer_7/stable_rank_k_proj": 38.43075180053711, "geo/layer_7/stable_rank_o_proj": 87.45440673828125, "geo/layer_7/stable_rank_gate_proj": 77.4603500366211, "geo/layer_7/stable_rank_down_proj": 143.90113830566406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3974412679672241, "geo/layer_7/attn_entropy_mean": 4.77987813949585, "geo/layer_7/attn_entropy_std": 0.7742457985877991, "geo/layer_14/stable_rank_q_proj": 51.53179168701172, "geo/layer_14/stable_rank_k_proj": 44.40834426879883, "geo/layer_14/stable_rank_o_proj": 42.19398498535156, "geo/layer_14/stable_rank_gate_proj": 72.0392074584961, "geo/layer_14/stable_rank_down_proj": 127.01789855957031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3590339422225952, "geo/layer_14/attn_entropy_mean": 5.531153678894043, "geo/layer_14/attn_entropy_std": 0.4654959440231323, "geo/layer_21/stable_rank_q_proj": 38.196346282958984, "geo/layer_21/stable_rank_k_proj": 28.515689849853516, "geo/layer_21/stable_rank_o_proj": 64.65888977050781, "geo/layer_21/stable_rank_gate_proj": 59.61560821533203, "geo/layer_21/stable_rank_down_proj": 48.565879821777344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13689737021923065, "geo/layer_21/attn_entropy_mean": 5.861424922943115, "geo/layer_21/attn_entropy_std": 0.33526450395584106, "geo/layer_27/stable_rank_q_proj": 45.34021759033203, "geo/layer_27/stable_rank_k_proj": 30.452083587646484, "geo/layer_27/stable_rank_o_proj": 106.78043365478516, "geo/layer_27/stable_rank_gate_proj": 69.06138610839844, "geo/layer_27/stable_rank_down_proj": 129.8152618408203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09464234113693237, "geo/layer_27/attn_entropy_mean": 4.320262908935547, "geo/layer_27/attn_entropy_std": 0.6973660588264465, "attnres/final_alpha/block_0": 0.26224368810653687, "attnres/block_norm/0": 1.7841670513153076, "attnres/final_alpha/block_1": 0.0037032240070402622, "attnres/block_norm/1": 50750.27734375, "attnres/final_alpha/block_2": 0.007937338203191757, "attnres/block_norm/2": 30124.63671875, "attnres/final_alpha/block_3": 0.010164722800254822, "attnres/block_norm/3": 73982.0625, "attnres/final_alpha/block_4": 0.011474169790744781, "attnres/block_norm/4": 17730.080078125, "attnres/final_alpha/block_5": 0.6080682277679443, "attnres/block_norm/5": 7238.3955078125, "attnres/final_alpha/block_6": 0.0964086502790451, "attnres/block_norm/6": 49731.2578125, "geo/tier1_time_s": 1.3624534606933594, "geo/step": 15150.0, "geo/rankme_slope": 9.618427449104641e-05} {"step": 15160, "timestamp": 1778342099.5567644, "train/loss": 2.30555522441864, "train/z_loss": 0.00137878421228379, "train/perplexity": 10.029745466351038, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789500.16839108, "perf/iters_per_sec": 0.8533001748996163, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1719205379486084, "data/tokens_consumed": 31794921472, "data/tokens_consumed_B": 31.794921472, "train/loss_slope": 2.7305485570605195e-06} {"step": 15170, "timestamp": 1778342109.904636, "train/loss": 2.349122428894043, "train/z_loss": 0.0013576066121459007, "train/perplexity": 10.476371928158201, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027691.2526951458, "perf/iters_per_sec": 0.9668785346484879, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342560768127442, "data/tokens_consumed": 31815892992, "data/tokens_consumed_B": 31.815892992, "train/loss_slope": 5.872343983551406e-06} {"step": 15180, "timestamp": 1778342120.2618058, "train/loss": 2.3513169288635254, "train/z_loss": 0.0013684909790754318, "train/perplexity": 10.499387570711654, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026212.5979491011, "perf/iters_per_sec": 0.9661734571214204, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350108385086059, "data/tokens_consumed": 31836864512, "data/tokens_consumed_B": 31.836864512, "train/loss_slope": 6.273062151185742e-06} {"step": 15190, "timestamp": 1778342130.6088295, "train/loss": 2.3342449426651, "train/z_loss": 0.0013583490625023841, "train/perplexity": 10.321663538615722, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027630.7226990974, "perf/iters_per_sec": 0.9668496716971862, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342869520187379, "data/tokens_consumed": 31857836032, "data/tokens_consumed_B": 31.857836032, "train/loss_slope": 5.550779024473723e-06} {"step": 15200, "timestamp": 1778342140.9618733, "grad/layer_0/attn": 0.003257542848587036, "grad/layer_0/mlp": 0.0035728258080780506, "grad/layer_0/attn_mlp_ratio": 0.9117552694694222, "grad/layer_4/attn": 0.0031404022593051195, "grad/layer_4/mlp": 0.0026896093040704727, "grad/layer_4/attn_mlp_ratio": 1.1676053238631654, "grad/layer_8/attn": 0.006619624327868223, "grad/layer_8/mlp": 0.0037864719051867723, "grad/layer_8/attn_mlp_ratio": 1.7482300988362125, "grad/layer_12/attn": 0.010808511637151241, "grad/layer_12/mlp": 0.007333602290600538, "grad/layer_12/attn_mlp_ratio": 1.4738338761049394, "grad/layer_16/attn": 0.003927404526621103, "grad/layer_16/mlp": 0.004555441439151764, "grad/layer_16/attn_mlp_ratio": 0.8621347662717229, "grad/layer_20/attn": 0.004520813934504986, "grad/layer_20/mlp": 0.0055432566441595554, "grad/layer_20/attn_mlp_ratio": 0.8155519657768282, "grad/layer_24/attn": 0.004878732841461897, "grad/layer_24/mlp": 0.008303103968501091, "grad/layer_24/attn_mlp_ratio": 0.5875793921420311, "grad/layer_27/attn": 0.0048376284539699554, "grad/layer_27/mlp": 0.0070023443549871445, "grad/layer_27/attn_mlp_ratio": 0.6908583953656472} {"step": 15200, "timestamp": 1778342140.9777207, "train/loss": 2.30502393245697, "train/z_loss": 0.0013794972095638514, "train/perplexity": 10.024418158510398, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023793.220725172, "perf/iters_per_sec": 0.9650198081613407, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362481594085693, "data/tokens_consumed": 31878807552, "data/tokens_consumed_B": 31.878807552, "train/loss_slope": 1.5807330912381349e-06} {"step": 15210, "timestamp": 1778342151.329026, "train/loss": 2.2962960243225097, "train/z_loss": 0.00137049478944391, "train/perplexity": 9.937306661279987, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027442.5652657999, "perf/iters_per_sec": 0.9667599512413978, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034382939338684, "data/tokens_consumed": 31899779072, "data/tokens_consumed_B": 31.899779072, "train/loss_slope": 2.7479885077275674e-07} {"step": 15220, "timestamp": 1778342161.6986177, "train/loss": 2.3368748664855956, "train/z_loss": 0.0013532933895476162, "train/perplexity": 10.348844453624592, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023380.7086510044, "perf/iters_per_sec": 0.9648231070761701, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364594221115113, "data/tokens_consumed": 31920750592, "data/tokens_consumed_B": 31.920750592, "train/loss_slope": -9.778598175376836e-08} {"step": 15225, "timestamp": 1778342167.4647927, "eos/sharpness": 3.4529685974121085, "eos/L0_probe": 2.3062362670898438, "eos/L_plus": 2.3231630325317383, "eos/L_minus": 2.3238391876220703, "eos/grad_norm": 0.08182661980390549, "eos/embed_grad_frac": 0.36870747804641724, "eos/time_s": 0.6020267009735107} {"step": 15225, "timestamp": 1778342168.847619, "geo/rankme_last": 429.30645751953125, "geo/layer_0/stable_rank_q_proj": 20.665592193603516, "geo/layer_0/stable_rank_k_proj": 16.814929962158203, "geo/layer_0/stable_rank_o_proj": 43.73653030395508, "geo/layer_0/stable_rank_gate_proj": 123.79723358154297, "geo/layer_0/stable_rank_down_proj": 57.93061065673828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06494121253490448, "geo/layer_0/attn_entropy_mean": 6.236631393432617, "geo/layer_0/attn_entropy_std": 0.4651159644126892, "geo/layer_7/stable_rank_q_proj": 41.58283996582031, "geo/layer_7/stable_rank_k_proj": 38.41401672363281, "geo/layer_7/stable_rank_o_proj": 87.50413513183594, "geo/layer_7/stable_rank_gate_proj": 77.40962982177734, "geo/layer_7/stable_rank_down_proj": 143.91600036621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39032799005508423, "geo/layer_7/attn_entropy_mean": 4.7466607093811035, "geo/layer_7/attn_entropy_std": 0.7628130912780762, "geo/layer_14/stable_rank_q_proj": 51.636993408203125, "geo/layer_14/stable_rank_k_proj": 44.36424255371094, "geo/layer_14/stable_rank_o_proj": 42.14577865600586, "geo/layer_14/stable_rank_gate_proj": 71.95198059082031, "geo/layer_14/stable_rank_down_proj": 126.69815063476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3782774806022644, "geo/layer_14/attn_entropy_mean": 5.535528182983398, "geo/layer_14/attn_entropy_std": 0.4994146227836609, "geo/layer_21/stable_rank_q_proj": 38.18058395385742, "geo/layer_21/stable_rank_k_proj": 28.56258773803711, "geo/layer_21/stable_rank_o_proj": 64.66190338134766, "geo/layer_21/stable_rank_gate_proj": 59.671566009521484, "geo/layer_21/stable_rank_down_proj": 48.563079833984375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13307052850723267, "geo/layer_21/attn_entropy_mean": 5.862335681915283, "geo/layer_21/attn_entropy_std": 0.33242061734199524, "geo/layer_27/stable_rank_q_proj": 45.45027160644531, "geo/layer_27/stable_rank_k_proj": 30.45725440979004, "geo/layer_27/stable_rank_o_proj": 106.73169708251953, "geo/layer_27/stable_rank_gate_proj": 69.02699279785156, "geo/layer_27/stable_rank_down_proj": 129.66452026367188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1021275743842125, "geo/layer_27/attn_entropy_mean": 4.325447082519531, "geo/layer_27/attn_entropy_std": 0.6916520595550537, "attnres/final_alpha/block_0": 0.264844685792923, "attnres/block_norm/0": 1.7843639850616455, "attnres/final_alpha/block_1": 0.0037055546417832375, "attnres/block_norm/1": 50890.34765625, "attnres/final_alpha/block_2": 0.00803911592811346, "attnres/block_norm/2": 30154.0, "attnres/final_alpha/block_3": 0.010364719666540623, "attnres/block_norm/3": 74515.078125, "attnres/final_alpha/block_4": 0.011817814782261848, "attnres/block_norm/4": 17819.150390625, "attnres/final_alpha/block_5": 0.6038693189620972, "attnres/block_norm/5": 7243.455078125, "attnres/final_alpha/block_6": 0.09735874086618423, "attnres/block_norm/6": 49827.01171875, "geo/tier1_time_s": 1.3609652519226074, "geo/step": 15225.0, "geo/rankme_slope": 6.645636770333133e-05} {"step": 15230, "timestamp": 1778342174.0278423, "train/loss": 2.3146594285964968, "train/z_loss": 0.0013741877395659686, "train/perplexity": 10.121475246657093, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701639.6452091776, "perf/iters_per_sec": 0.8114050127073181, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2324301481246949, "data/tokens_consumed": 31941722112, "data/tokens_consumed_B": 31.941722112, "train/loss_slope": -1.762854884846153e-06} {"step": 15240, "timestamp": 1778342184.3940437, "train/loss": 2.280611205101013, "train/z_loss": 0.0013667043996974826, "train/perplexity": 9.78265779298373, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024155.7798777341, "perf/iters_per_sec": 0.965192689837329, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360625505447387, "data/tokens_consumed": 31962693632, "data/tokens_consumed_B": 31.962693632, "train/loss_slope": -3.3806394107200245e-06} {"step": 15250, "timestamp": 1778342194.7320144, "grad/layer_0/attn": 0.002562751527875662, "grad/layer_0/mlp": 0.003085371572524309, "grad/layer_0/attn_mlp_ratio": 0.8306135532057104, "grad/layer_4/attn": 0.001893798471428454, "grad/layer_4/mlp": 0.0026165263261646032, "grad/layer_4/attn_mlp_ratio": 0.7237834300051196, "grad/layer_8/attn": 0.004148988518863916, "grad/layer_8/mlp": 0.003422425128519535, "grad/layer_8/attn_mlp_ratio": 1.2122948616349103, "grad/layer_12/attn": 0.006095854099839926, "grad/layer_12/mlp": 0.006652841344475746, "grad/layer_12/attn_mlp_ratio": 0.9162782775924534, "grad/layer_16/attn": 0.004151224158704281, "grad/layer_16/mlp": 0.0046864766627550125, "grad/layer_16/attn_mlp_ratio": 0.8857878463615649, "grad/layer_20/attn": 0.004805745091289282, "grad/layer_20/mlp": 0.005855530966073275, "grad/layer_20/attn_mlp_ratio": 0.8207189129494312, "grad/layer_24/attn": 0.008977017365396023, "grad/layer_24/mlp": 0.009750708937644958, "grad/layer_24/attn_mlp_ratio": 0.9206527782480317, "grad/layer_27/attn": 0.009092703461647034, "grad/layer_27/mlp": 0.007643821649253368, "grad/layer_27/attn_mlp_ratio": 1.189549385101136} {"step": 15250, "timestamp": 1778342194.7478437, "train/loss": 2.3651635885238647, "train/z_loss": 0.0013744481140747667, "train/perplexity": 10.645780202539543, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026550.5781121352, "perf/iters_per_sec": 0.9663346186218906, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348382234573363, "data/tokens_consumed": 31983665152, "data/tokens_consumed_B": 31.983665152, "train/loss_slope": -7.56589192511556e-07} {"step": 15260, "timestamp": 1778342205.0960398, "train/loss": 2.3434677839279177, "train/z_loss": 0.0013662206707522272, "train/perplexity": 10.417298940163349, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028037.7688315932, "perf/iters_per_sec": 0.9670437664182631, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034079360961914, "data/tokens_consumed": 32004636672, "data/tokens_consumed_B": 32.004636672, "train/loss_slope": -2.1621044855951017e-06} {"step": 15270, "timestamp": 1778342215.4539309, "train/loss": 2.325887656211853, "train/z_loss": 0.0013836398138664663, "train/perplexity": 10.235761892060339, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025820.607515434, "perf/iters_per_sec": 0.9659865415169878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352111101150512, "data/tokens_consumed": 32025608192, "data/tokens_consumed_B": 32.025608192, "train/loss_slope": -3.0038855364113478e-06} {"step": 15280, "timestamp": 1778342225.8076289, "train/loss": 2.3058666944503785, "train/z_loss": 0.001371793681755662, "train/perplexity": 10.032869918051047, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026537.831786882, "perf/iters_per_sec": 0.9663285407003793, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034844732284546, "data/tokens_consumed": 32046579712, "data/tokens_consumed_B": 32.046579712, "train/loss_slope": -1.2986198045311451e-06} {"step": 15290, "timestamp": 1778342236.1580453, "train/loss": 2.2909323453903196, "train/z_loss": 0.001377050031442195, "train/perplexity": 9.884148827117404, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027555.8948577244, "perf/iters_per_sec": 0.9668139910019514, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034325122833252, "data/tokens_consumed": 32067551232, "data/tokens_consumed_B": 32.067551232, "train/loss_slope": -2.7528970405357053e-06} {"step": 15300, "timestamp": 1778342246.498679, "grad/layer_0/attn": 0.0027017672546207905, "grad/layer_0/mlp": 0.003090274753049016, "grad/layer_0/attn_mlp_ratio": 0.8742805682656658, "grad/layer_4/attn": 0.002531114500015974, "grad/layer_4/mlp": 0.002734028035774827, "grad/layer_4/attn_mlp_ratio": 0.9257821698673386, "grad/layer_8/attn": 0.003881541546434164, "grad/layer_8/mlp": 0.0035946157295256853, "grad/layer_8/attn_mlp_ratio": 1.0798209685028675, "grad/layer_12/attn": 0.010001410730183125, "grad/layer_12/mlp": 0.0068940091878175735, "grad/layer_12/attn_mlp_ratio": 1.4507393756861702, "grad/layer_16/attn": 0.004054096527397633, "grad/layer_16/mlp": 0.0044478196650743484, "grad/layer_16/attn_mlp_ratio": 0.9114794981648421, "grad/layer_20/attn": 0.0025758109986782074, "grad/layer_20/mlp": 0.006026523187756538, "grad/layer_20/attn_mlp_ratio": 0.42741243594150136, "grad/layer_24/attn": 0.007766399532556534, "grad/layer_24/mlp": 0.008193723857402802, "grad/layer_24/attn_mlp_ratio": 0.9478473491335778, "grad/layer_27/attn": 0.0058233425952494144, "grad/layer_27/mlp": 0.007998529821634293, "grad/layer_27/attn_mlp_ratio": 0.728051611021487} {"step": 15300, "timestamp": 1778342247.1038935, "eos/sharpness": 46.567249298095696, "eos/L0_probe": 2.3058276176452637, "eos/L_plus": 2.5763204097747803, "eos/L_minus": 2.501007318496704, "eos/grad_norm": 0.1259123831987381, "eos/embed_grad_frac": 0.13842730224132538, "eos/time_s": 0.6024930477142334} {"step": 15300, "timestamp": 1778342247.1230292, "train/loss": 2.305192255973816, "train/z_loss": 0.0013840861618518829, "train/perplexity": 10.026105645847087, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913813.4710869924, "perf/iters_per_sec": 0.91257737688398, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0957974910736084, "data/tokens_consumed": 32088522752, "data/tokens_consumed_B": 32.088522752, "train/loss_slope": -4.57139085299831e-06} {"step": 15300, "timestamp": 1778342248.489968, "geo/rankme_last": 429.42742919921875, "geo/layer_0/stable_rank_q_proj": 20.62246322631836, "geo/layer_0/stable_rank_k_proj": 16.762710571289062, "geo/layer_0/stable_rank_o_proj": 43.724029541015625, "geo/layer_0/stable_rank_gate_proj": 123.72480010986328, "geo/layer_0/stable_rank_down_proj": 57.92812728881836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06349940598011017, "geo/layer_0/attn_entropy_mean": 6.238564491271973, "geo/layer_0/attn_entropy_std": 0.46540310978889465, "geo/layer_7/stable_rank_q_proj": 41.6544189453125, "geo/layer_7/stable_rank_k_proj": 38.44723892211914, "geo/layer_7/stable_rank_o_proj": 87.36741638183594, "geo/layer_7/stable_rank_gate_proj": 77.43721008300781, "geo/layer_7/stable_rank_down_proj": 143.62615966796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3876325488090515, "geo/layer_7/attn_entropy_mean": 4.733660697937012, "geo/layer_7/attn_entropy_std": 0.7877622842788696, "geo/layer_14/stable_rank_q_proj": 51.627159118652344, "geo/layer_14/stable_rank_k_proj": 44.359222412109375, "geo/layer_14/stable_rank_o_proj": 42.13987731933594, "geo/layer_14/stable_rank_gate_proj": 71.89627838134766, "geo/layer_14/stable_rank_down_proj": 126.6981430053711, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3876120448112488, "geo/layer_14/attn_entropy_mean": 5.515036582946777, "geo/layer_14/attn_entropy_std": 0.4683534502983093, "geo/layer_21/stable_rank_q_proj": 38.23405456542969, "geo/layer_21/stable_rank_k_proj": 28.518789291381836, "geo/layer_21/stable_rank_o_proj": 64.68109893798828, "geo/layer_21/stable_rank_gate_proj": 59.659420013427734, "geo/layer_21/stable_rank_down_proj": 48.47418975830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13303320109844208, "geo/layer_21/attn_entropy_mean": 5.857570648193359, "geo/layer_21/attn_entropy_std": 0.34339937567710876, "geo/layer_27/stable_rank_q_proj": 45.44000244140625, "geo/layer_27/stable_rank_k_proj": 30.44386863708496, "geo/layer_27/stable_rank_o_proj": 106.59657287597656, "geo/layer_27/stable_rank_gate_proj": 69.05218505859375, "geo/layer_27/stable_rank_down_proj": 129.63900756835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08872487396001816, "geo/layer_27/attn_entropy_mean": 4.307321548461914, "geo/layer_27/attn_entropy_std": 0.6990953683853149, "attnres/final_alpha/block_0": 0.26382994651794434, "attnres/block_norm/0": 1.7841579914093018, "attnres/final_alpha/block_1": 0.0036937170661985874, "attnres/block_norm/1": 50792.859375, "attnres/final_alpha/block_2": 0.008089570328593254, "attnres/block_norm/2": 30177.748046875, "attnres/final_alpha/block_3": 0.010255273431539536, "attnres/block_norm/3": 74423.7421875, "attnres/final_alpha/block_4": 0.01186644472181797, "attnres/block_norm/4": 17762.59375, "attnres/final_alpha/block_5": 0.6040253639221191, "attnres/block_norm/5": 7240.365234375, "attnres/final_alpha/block_6": 0.0982397049665451, "attnres/block_norm/6": 50005.3828125, "geo/tier1_time_s": 1.3630797863006592, "geo/step": 15300.0, "geo/rankme_slope": 6.569739223814526e-05} {"step": 15310, "timestamp": 1778342258.849533, "train/loss": 2.34778687953949, "train/z_loss": 0.0013682740507647395, "train/perplexity": 10.462389555544206, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788945.1481798755, "perf/iters_per_sec": 0.8530355206393602, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1722841262817383, "data/tokens_consumed": 32109494272, "data/tokens_consumed_B": 32.109494272, "train/loss_slope": -6.1824288698706795e-06} {"step": 15320, "timestamp": 1778342269.20098, "train/loss": 2.3186108589172365, "train/z_loss": 0.0013710420695133506, "train/perplexity": 10.16154867237053, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027331.678097983, "perf/iters_per_sec": 0.9667070761194148, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344395160675048, "data/tokens_consumed": 32130465792, "data/tokens_consumed_B": 32.130465792, "train/loss_slope": -4.8476130333599646e-06} {"step": 15330, "timestamp": 1778342279.5500886, "train/loss": 2.3183971643447876, "train/z_loss": 0.0013737752800807358, "train/perplexity": 10.159377436570486, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027711.4457448155, "perf/iters_per_sec": 0.9668881634449079, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034245777130127, "data/tokens_consumed": 32151437312, "data/tokens_consumed_B": 32.151437312, "train/loss_slope": -5.3308566339803955e-06} {"step": 15340, "timestamp": 1778342289.9106443, "train/loss": 2.316266655921936, "train/z_loss": 0.0013700299081392587, "train/perplexity": 10.137755838048315, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025168.3730819842, "perf/iters_per_sec": 0.9656755319032594, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035544514656067, "data/tokens_consumed": 32172408832, "data/tokens_consumed_B": 32.172408832, "train/loss_slope": -3.887051703369845e-06} {"step": 15350, "timestamp": 1778342300.2703788, "grad/layer_0/attn": 0.002890227362513542, "grad/layer_0/mlp": 0.0033733192831277847, "grad/layer_0/attn_mlp_ratio": 0.8567903107454019, "grad/layer_4/attn": 0.0022952579893171787, "grad/layer_4/mlp": 0.002546086208894849, "grad/layer_4/attn_mlp_ratio": 0.9014847537958977, "grad/layer_8/attn": 0.003831123001873493, "grad/layer_8/mlp": 0.003353626001626253, "grad/layer_8/attn_mlp_ratio": 1.142382270944185, "grad/layer_12/attn": 0.008854534476995468, "grad/layer_12/mlp": 0.0068641649559140205, "grad/layer_12/attn_mlp_ratio": 1.2899652623251807, "grad/layer_16/attn": 0.003455433761700988, "grad/layer_16/mlp": 0.004570064600557089, "grad/layer_16/attn_mlp_ratio": 0.7561017158640604, "grad/layer_20/attn": 0.0038925078697502613, "grad/layer_20/mlp": 0.005848982371389866, "grad/layer_20/attn_mlp_ratio": 0.6655017156899264, "grad/layer_24/attn": 0.008557611145079136, "grad/layer_24/mlp": 0.008017581887543201, "grad/layer_24/attn_mlp_ratio": 1.067355613996211, "grad/layer_27/attn": 0.00854307971894741, "grad/layer_27/mlp": 0.008003393188118935, "grad/layer_27/attn_mlp_ratio": 1.0674322017424338} {"step": 15350, "timestamp": 1778342300.2863321, "train/loss": 2.3814342975616456, "train/z_loss": 0.0013648156193085015, "train/perplexity": 10.820411429052685, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022691.8566422418, "perf/iters_per_sec": 0.9644946368418893, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368124008178712, "data/tokens_consumed": 32193380352, "data/tokens_consumed_B": 32.193380352, "train/loss_slope": -1.5195173005460174e-06} {"step": 15360, "timestamp": 1778342310.6367908, "train/loss": 2.330351161956787, "train/z_loss": 0.0013864083215594293, "train/perplexity": 10.28155138888886, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027328.0801924698, "perf/iters_per_sec": 0.9667053605043744, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034441351890564, "data/tokens_consumed": 32214351872, "data/tokens_consumed_B": 32.214351872, "train/loss_slope": -6.068907519414594e-07} {"step": 15370, "timestamp": 1778342320.992839, "train/loss": 2.3081768274307253, "train/z_loss": 0.0013702709111385047, "train/perplexity": 10.056073973643866, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026014.763817915, "perf/iters_per_sec": 0.9660791224565101, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035111904144287, "data/tokens_consumed": 32235323392, "data/tokens_consumed_B": 32.235323392, "train/loss_slope": -3.732139485540115e-07} {"step": 15375, "timestamp": 1778342326.7605407, "eos/sharpness": 61.329340934753404, "eos/L0_probe": 2.3061599731445312, "eos/L_plus": 2.656134843826294, "eos/L_minus": 2.5694785118103027, "eos/grad_norm": 0.20813600718975067, "eos/embed_grad_frac": 0.061736904084682465, "eos/time_s": 0.6037800312042236} {"step": 15375, "timestamp": 1778342328.1428092, "geo/rankme_last": 430.3833312988281, "geo/layer_0/stable_rank_q_proj": 20.61018943786621, "geo/layer_0/stable_rank_k_proj": 16.762277603149414, "geo/layer_0/stable_rank_o_proj": 43.720252990722656, "geo/layer_0/stable_rank_gate_proj": 123.445068359375, "geo/layer_0/stable_rank_down_proj": 57.872955322265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06519829481840134, "geo/layer_0/attn_entropy_mean": 6.230722904205322, "geo/layer_0/attn_entropy_std": 0.4664604961872101, "geo/layer_7/stable_rank_q_proj": 41.65610885620117, "geo/layer_7/stable_rank_k_proj": 38.472660064697266, "geo/layer_7/stable_rank_o_proj": 87.44530487060547, "geo/layer_7/stable_rank_gate_proj": 77.30464172363281, "geo/layer_7/stable_rank_down_proj": 143.8895263671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3932589590549469, "geo/layer_7/attn_entropy_mean": 4.714765548706055, "geo/layer_7/attn_entropy_std": 0.765442430973053, "geo/layer_14/stable_rank_q_proj": 51.56367492675781, "geo/layer_14/stable_rank_k_proj": 44.28342056274414, "geo/layer_14/stable_rank_o_proj": 42.172508239746094, "geo/layer_14/stable_rank_gate_proj": 71.88446807861328, "geo/layer_14/stable_rank_down_proj": 126.76464080810547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3684249818325043, "geo/layer_14/attn_entropy_mean": 5.528556823730469, "geo/layer_14/attn_entropy_std": 0.48425984382629395, "geo/layer_21/stable_rank_q_proj": 38.198490142822266, "geo/layer_21/stable_rank_k_proj": 28.513551712036133, "geo/layer_21/stable_rank_o_proj": 64.64213562011719, "geo/layer_21/stable_rank_gate_proj": 59.62892150878906, "geo/layer_21/stable_rank_down_proj": 48.430519104003906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13628171384334564, "geo/layer_21/attn_entropy_mean": 5.836126327514648, "geo/layer_21/attn_entropy_std": 0.33575335144996643, "geo/layer_27/stable_rank_q_proj": 45.40955352783203, "geo/layer_27/stable_rank_k_proj": 30.430463790893555, "geo/layer_27/stable_rank_o_proj": 106.63534545898438, "geo/layer_27/stable_rank_gate_proj": 68.9637451171875, "geo/layer_27/stable_rank_down_proj": 129.7342987060547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10497470200061798, "geo/layer_27/attn_entropy_mean": 4.3158087730407715, "geo/layer_27/attn_entropy_std": 0.7066792249679565, "attnres/final_alpha/block_0": 0.2633568346500397, "attnres/block_norm/0": 1.7843053340911865, "attnres/final_alpha/block_1": 0.003725865390151739, "attnres/block_norm/1": 50689.3203125, "attnres/final_alpha/block_2": 0.007963098585605621, "attnres/block_norm/2": 30085.978515625, "attnres/final_alpha/block_3": 0.010134436190128326, "attnres/block_norm/3": 73950.734375, "attnres/final_alpha/block_4": 0.011827129870653152, "attnres/block_norm/4": 17797.5234375, "attnres/final_alpha/block_5": 0.605445921421051, "attnres/block_norm/5": 7244.4921875, "attnres/final_alpha/block_6": 0.09754671156406403, "attnres/block_norm/6": 49800.13671875, "geo/tier1_time_s": 1.3605411052703857, "geo/step": 15375.0, "geo/rankme_slope": 7.562644198304322e-05} {"step": 15380, "timestamp": 1778342333.3199058, "train/loss": 2.3344335556030273, "train/z_loss": 0.0013518283725716175, "train/perplexity": 10.323610521507346, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702115.3905166974, "perf/iters_per_sec": 0.8116318657477843, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.232085680961609, "data/tokens_consumed": 32256294912, "data/tokens_consumed_B": 32.256294912, "train/loss_slope": -1.2670161879316846e-06} {"step": 15390, "timestamp": 1778342343.6662958, "train/loss": 2.3275171279907227, "train/z_loss": 0.0013515862985514105, "train/perplexity": 10.25245437346891, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027930.6971023353, "perf/iters_per_sec": 0.9669927106391598, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0341339588165284, "data/tokens_consumed": 32277266432, "data/tokens_consumed_B": 32.277266432, "train/loss_slope": -2.6344825225015467e-06} {"step": 15400, "timestamp": 1778342354.011824, "grad/layer_0/attn": 0.003185712732374668, "grad/layer_0/mlp": 0.0036958048585802317, "grad/layer_0/attn_mlp_ratio": 0.8619807506288105, "grad/layer_4/attn": 0.0027667731046676636, "grad/layer_4/mlp": 0.002726305276155472, "grad/layer_4/attn_mlp_ratio": 1.0148434320183375, "grad/layer_8/attn": 0.007015439681708813, "grad/layer_8/mlp": 0.003530432004481554, "grad/layer_8/attn_mlp_ratio": 1.9871334369533313, "grad/layer_12/attn": 0.005937227979302406, "grad/layer_12/mlp": 0.006901760585606098, "grad/layer_12/attn_mlp_ratio": 0.8602483119538955, "grad/layer_16/attn": 0.0034992669243365526, "grad/layer_16/mlp": 0.005097394809126854, "grad/layer_16/attn_mlp_ratio": 0.6864814256535504, "grad/layer_20/attn": 0.00549229746684432, "grad/layer_20/mlp": 0.006598588079214096, "grad/layer_20/attn_mlp_ratio": 0.8323443314958417, "grad/layer_24/attn": 0.013036561198532581, "grad/layer_24/mlp": 0.013929345645010471, "grad/layer_24/attn_mlp_ratio": 0.9359062110438541, "grad/layer_27/attn": 0.006543754134327173, "grad/layer_27/mlp": 0.013036203570663929, "grad/layer_27/attn_mlp_ratio": 0.5019677737202691} {"step": 15400, "timestamp": 1778342354.027643, "train/loss": 2.380313444137573, "train/z_loss": 0.0013715210370719433, "train/perplexity": 10.808290128222339, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025511.928496777, "perf/iters_per_sec": 0.9658393518909345, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353688716888427, "data/tokens_consumed": 32298237952, "data/tokens_consumed_B": 32.298237952, "train/loss_slope": -2.8997428274856078e-06} {"step": 15410, "timestamp": 1778342364.3851604, "train/loss": 2.295921730995178, "train/z_loss": 0.0013711956911720336, "train/perplexity": 9.93358788970413, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026175.585701138, "perf/iters_per_sec": 0.966155808306283, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350297451019288, "data/tokens_consumed": 32319209472, "data/tokens_consumed_B": 32.319209472, "train/loss_slope": -5.709479672752169e-06} {"step": 15420, "timestamp": 1778342374.7406735, "train/loss": 2.370152735710144, "train/z_loss": 0.0013736734632402658, "train/perplexity": 10.699026282700045, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026755.0548197343, "perf/iters_per_sec": 0.9664321207140609, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034733819961548, "data/tokens_consumed": 32340180992, "data/tokens_consumed_B": 32.340180992, "train/loss_slope": -3.441951727674035e-06} {"step": 15430, "timestamp": 1778342385.0883427, "train/loss": 2.3230637550354003, "train/z_loss": 0.0013569267932325602, "train/perplexity": 10.206897885743292, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028117.1681825814, "perf/iters_per_sec": 0.9670816269791515, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0340388774871827, "data/tokens_consumed": 32361152512, "data/tokens_consumed_B": 32.361152512, "train/loss_slope": -7.511899165838725e-07} {"step": 15440, "timestamp": 1778342395.4400272, "train/loss": 2.3563366889953614, "train/z_loss": 0.0013619529083371163, "train/perplexity": 10.552224481207583, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027380.7884487167, "perf/iters_per_sec": 0.966730493759497, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344144582748414, "data/tokens_consumed": 32382124032, "data/tokens_consumed_B": 32.382124032, "train/loss_slope": -9.090204264643923e-07} {"step": 15450, "timestamp": 1778342405.786665, "grad/layer_0/attn": 0.002710258588194847, "grad/layer_0/mlp": 0.002989912172779441, "grad/layer_0/attn_mlp_ratio": 0.90646759534365, "grad/layer_4/attn": 0.00219902815297246, "grad/layer_4/mlp": 0.002644642721861601, "grad/layer_4/attn_mlp_ratio": 0.8315028913524637, "grad/layer_8/attn": 0.0037523380015045404, "grad/layer_8/mlp": 0.003461617510765791, "grad/layer_8/attn_mlp_ratio": 1.083983970336468, "grad/layer_12/attn": 0.007416066713631153, "grad/layer_12/mlp": 0.0072004725225269794, "grad/layer_12/attn_mlp_ratio": 1.0299416583336043, "grad/layer_16/attn": 0.00347504741512239, "grad/layer_16/mlp": 0.004625493194907904, "grad/layer_16/attn_mlp_ratio": 0.7512814728210692, "grad/layer_20/attn": 0.0024089065846055746, "grad/layer_20/mlp": 0.005144746974110603, "grad/layer_20/attn_mlp_ratio": 0.4682264356060718, "grad/layer_24/attn": 0.004815326537936926, "grad/layer_24/mlp": 0.007150374352931976, "grad/layer_24/attn_mlp_ratio": 0.673436974473753, "grad/layer_27/attn": 0.004486485384404659, "grad/layer_27/mlp": 0.007124024908989668, "grad/layer_27/attn_mlp_ratio": 0.6297683372452022} {"step": 15450, "timestamp": 1778342406.39782, "eos/sharpness": 22.23265171051025, "eos/L0_probe": 2.3037009239196777, "eos/L_plus": 2.431699275970459, "eos/L_minus": 2.398029088973999, "eos/grad_norm": 0.1046760082244873, "eos/embed_grad_frac": 0.24046313762664795, "eos/time_s": 0.6082911491394043} {"step": 15450, "timestamp": 1778342406.4274187, "train/loss": 2.308363509178162, "train/z_loss": 0.0013527449569664895, "train/perplexity": 10.057951434343986, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909780.338981991, "perf/iters_per_sec": 0.9106542296323733, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.098111629486084, "data/tokens_consumed": 32403095552, "data/tokens_consumed_B": 32.403095552, "train/loss_slope": -2.990223787012259e-07} {"step": 15450, "timestamp": 1778342407.789539, "geo/rankme_last": 430.1964111328125, "geo/layer_0/stable_rank_q_proj": 20.593334197998047, "geo/layer_0/stable_rank_k_proj": 16.774850845336914, "geo/layer_0/stable_rank_o_proj": 43.76026153564453, "geo/layer_0/stable_rank_gate_proj": 123.5163345336914, "geo/layer_0/stable_rank_down_proj": 57.84117889404297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0643785297870636, "geo/layer_0/attn_entropy_mean": 6.226627349853516, "geo/layer_0/attn_entropy_std": 0.46671199798583984, "geo/layer_7/stable_rank_q_proj": 41.664371490478516, "geo/layer_7/stable_rank_k_proj": 38.540283203125, "geo/layer_7/stable_rank_o_proj": 87.26667022705078, "geo/layer_7/stable_rank_gate_proj": 77.41925811767578, "geo/layer_7/stable_rank_down_proj": 144.0079345703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39627665281295776, "geo/layer_7/attn_entropy_mean": 4.732805252075195, "geo/layer_7/attn_entropy_std": 0.7865418791770935, "geo/layer_14/stable_rank_q_proj": 51.64725875854492, "geo/layer_14/stable_rank_k_proj": 44.28840637207031, "geo/layer_14/stable_rank_o_proj": 42.22543716430664, "geo/layer_14/stable_rank_gate_proj": 71.88507080078125, "geo/layer_14/stable_rank_down_proj": 126.64501190185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37832459807395935, "geo/layer_14/attn_entropy_mean": 5.51591682434082, "geo/layer_14/attn_entropy_std": 0.512222945690155, "geo/layer_21/stable_rank_q_proj": 38.22555923461914, "geo/layer_21/stable_rank_k_proj": 28.555618286132812, "geo/layer_21/stable_rank_o_proj": 64.68331146240234, "geo/layer_21/stable_rank_gate_proj": 59.58858871459961, "geo/layer_21/stable_rank_down_proj": 48.43193054199219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13657867908477783, "geo/layer_21/attn_entropy_mean": 5.855896949768066, "geo/layer_21/attn_entropy_std": 0.3393944203853607, "geo/layer_27/stable_rank_q_proj": 45.403324127197266, "geo/layer_27/stable_rank_k_proj": 30.39889144897461, "geo/layer_27/stable_rank_o_proj": 106.67768859863281, "geo/layer_27/stable_rank_gate_proj": 68.99373626708984, "geo/layer_27/stable_rank_down_proj": 129.72286987304688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10205670446157455, "geo/layer_27/attn_entropy_mean": 4.299383163452148, "geo/layer_27/attn_entropy_std": 0.7067750692367554, "attnres/final_alpha/block_0": 0.26348692178726196, "attnres/block_norm/0": 1.784536361694336, "attnres/final_alpha/block_1": 0.0036926036700606346, "attnres/block_norm/1": 50625.05859375, "attnres/final_alpha/block_2": 0.008021378889679909, "attnres/block_norm/2": 30004.578125, "attnres/final_alpha/block_3": 0.010156583040952682, "attnres/block_norm/3": 74023.109375, "attnres/final_alpha/block_4": 0.011813006363809109, "attnres/block_norm/4": 17927.3359375, "attnres/final_alpha/block_5": 0.6051867008209229, "attnres/block_norm/5": 7267.7255859375, "attnres/final_alpha/block_6": 0.09764277189970016, "attnres/block_norm/6": 49920.9453125, "geo/tier1_time_s": 1.3580193519592285, "geo/step": 15450.0, "geo/rankme_slope": 5.3558650022509e-05} {"step": 15460, "timestamp": 1778342418.1607745, "train/loss": 2.323378157615662, "train/z_loss": 0.0013690499239601196, "train/perplexity": 10.210107465298643, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787896.7907872498, "perf/iters_per_sec": 0.8525356248794793, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.172971510887146, "data/tokens_consumed": 32424067072, "data/tokens_consumed_B": 32.424067072, "train/loss_slope": 2.222349405503275e-06} {"step": 15470, "timestamp": 1778342428.525769, "train/loss": 2.314882779121399, "train/z_loss": 0.0013735970482230187, "train/perplexity": 10.123736135942222, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024676.9095931014, "perf/iters_per_sec": 0.9654411838498599, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357958793640136, "data/tokens_consumed": 32445038592, "data/tokens_consumed_B": 32.445038592, "train/loss_slope": 2.883886954750534e-07} {"step": 15480, "timestamp": 1778342438.8827739, "train/loss": 2.3740258693695067, "train/z_loss": 0.0013819379499182105, "train/perplexity": 10.740545394149102, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025801.3852198625, "perf/iters_per_sec": 0.9659773756121933, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352209329605102, "data/tokens_consumed": 32466010112, "data/tokens_consumed_B": 32.466010112, "train/loss_slope": 1.8024792658327443e-06} {"step": 15490, "timestamp": 1778342449.2474368, "train/loss": 2.313347053527832, "train/z_loss": 0.00136241284199059, "train/perplexity": 10.108200787325261, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024822.2773876358, "perf/iters_per_sec": 0.9655105006158999, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035721516609192, "data/tokens_consumed": 32486981632, "data/tokens_consumed_B": 32.486981632, "train/loss_slope": 1.0356606930681048e-07} {"step": 15500, "timestamp": 1778342459.6065865, "grad/layer_0/attn": 0.0035978967789560556, "grad/layer_0/mlp": 0.003746494883671403, "grad/layer_0/attn_mlp_ratio": 0.9603367399761663, "grad/layer_4/attn": 0.002130395732820034, "grad/layer_4/mlp": 0.0026055583730340004, "grad/layer_4/attn_mlp_ratio": 0.8176349734110281, "grad/layer_8/attn": 0.006885947193950415, "grad/layer_8/mlp": 0.0036454605869948864, "grad/layer_8/attn_mlp_ratio": 1.8889100130789789, "grad/layer_12/attn": 0.010595531202852726, "grad/layer_12/mlp": 0.00713031692430377, "grad/layer_12/attn_mlp_ratio": 1.485983185142783, "grad/layer_16/attn": 0.004161390010267496, "grad/layer_16/mlp": 0.004731594119220972, "grad/layer_16/attn_mlp_ratio": 0.8794900444680659, "grad/layer_20/attn": 0.002981319325044751, "grad/layer_20/mlp": 0.005946289282292128, "grad/layer_20/attn_mlp_ratio": 0.5013747453871034, "grad/layer_24/attn": 0.00898582674562931, "grad/layer_24/mlp": 0.008495277725160122, "grad/layer_24/attn_mlp_ratio": 1.057743717223273, "grad/layer_27/attn": 0.005396484863013029, "grad/layer_27/mlp": 0.00891465600579977, "grad/layer_27/attn_mlp_ratio": 0.6053497520226427} {"step": 15500, "timestamp": 1778342459.622516, "train/loss": 2.330711507797241, "train/z_loss": 0.0013799536507576704, "train/perplexity": 10.285256970770671, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023087.708919489, "perf/iters_per_sec": 0.9646833939168401, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0366095304489136, "data/tokens_consumed": 32507953152, "data/tokens_consumed_B": 32.507953152, "train/loss_slope": -7.06419456909995e-07} {"step": 15500, "timestamp": 1778342466.6680365, "geo/ww_alpha_mean": 7.826090447632897, "geo/ww_alpha_std": 4.874598310875069, "geo/ww_alpha_min": 1.331400212260947, "geo/ww_alpha_max": 32.101121293802265, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.855471419702604, "geo/ww_alpha_by_type/k_proj": 4.581210497119896, "geo/ww_alpha_by_type/v_proj": 9.635040566193299, "geo/ww_alpha_by_type/o_proj": 8.693551257223978, "geo/ww_alpha_by_type/gate_proj": 7.659127532561092, "geo/ww_alpha_by_type/up_proj": 12.211826357472146, "geo/ww_alpha_by_type/down_proj": 8.251504426965115, "geo/twonn_id/layer_0": 0.7566030621528625, "geo/twonn_id/layer_7": 3.827247142791748, "geo/twonn_id/layer_14": 5.356969833374023, "geo/twonn_id/layer_21": 7.593842029571533, "geo/twonn_id/layer_27": 6.622809410095215, "geo/tier2_time_s": 7.0358006954193115} {"step": 15500, "timestamp": 1778342467.4267147, "eoc/jacobian_sigma/layer_0/attn": 1396.6119384765625, "eoc/jacobian_sigma/layer_0/mlp": 11490.38671875, "eoc/jacobian_sigma/layer_0": 11490.38671875, "eoc/jacobian_sigma/layer_7/attn": 1.1309444904327393, "eoc/jacobian_sigma/layer_7/mlp": 1.8135813474655151, "eoc/jacobian_sigma/layer_7": 1.8135813474655151, "eoc/jacobian_sigma/layer_14/attn": 2.272784471511841, "eoc/jacobian_sigma/layer_14/mlp": 13.164473533630371, "eoc/jacobian_sigma/layer_14": 13.164473533630371, "eoc/jacobian_sigma/layer_21/attn": 1.0916821956634521, "eoc/jacobian_sigma/layer_21/mlp": 5.556040287017822, "eoc/jacobian_sigma/layer_21": 5.556040287017822, "eoc/jacobian_sigma/layer_27/attn": 3.9280340671539307, "eoc/jacobian_sigma/layer_27/mlp": 53.65420150756836, "eoc/jacobian_sigma/layer_27": 53.65420150756836, "eoc/layer0_sigma": 11490.38671875, "eoc/sigma_max": 53.65420150756836, "eoc/sigma_min": 1.8135813474655151, "eoc/sigma_mean": 18.547074168920517, "eoc/time_s": 0.7486374378204346} {"step": 15510, "timestamp": 1778342477.8025105, "train/loss": 2.2980802774429323, "train/z_loss": 0.0013767710304819047, "train/perplexity": 9.955053159112955, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1154046.5595669604, "perf/iters_per_sec": 0.5502922818980028, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8172161102294921, "data/tokens_consumed": 32528924672, "data/tokens_consumed_B": 32.528924672, "train/loss_slope": -4.5937836581510396e-07} {"step": 15520, "timestamp": 1778342488.1726563, "train/loss": 2.3014501333236694, "train/z_loss": 0.0013639076496474445, "train/perplexity": 9.988656841527563, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024539.6240076558, "perf/iters_per_sec": 0.9653757209814338, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035866117477417, "data/tokens_consumed": 32549896192, "data/tokens_consumed_B": 32.549896192, "train/loss_slope": -1.3042909620475596e-06} {"step": 15525, "timestamp": 1778342493.9604704, "eos/sharpness": 57.67555236816405, "eos/L0_probe": 2.3057212829589844, "eos/L_plus": 2.639425277709961, "eos/L_minus": 2.5487728118896484, "eos/grad_norm": 0.1758081614971161, "eos/embed_grad_frac": 0.0779368206858635, "eos/time_s": 0.6184492111206055} {"step": 15525, "timestamp": 1778342495.3483584, "geo/rankme_last": 430.90863037109375, "geo/layer_0/stable_rank_q_proj": 20.592159271240234, "geo/layer_0/stable_rank_k_proj": 16.744613647460938, "geo/layer_0/stable_rank_o_proj": 43.768402099609375, "geo/layer_0/stable_rank_gate_proj": 123.48139190673828, "geo/layer_0/stable_rank_down_proj": 57.858394622802734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06798423826694489, "geo/layer_0/attn_entropy_mean": 6.232062339782715, "geo/layer_0/attn_entropy_std": 0.4655144512653351, "geo/layer_7/stable_rank_q_proj": 41.65757369995117, "geo/layer_7/stable_rank_k_proj": 38.6832275390625, "geo/layer_7/stable_rank_o_proj": 87.28169250488281, "geo/layer_7/stable_rank_gate_proj": 77.3616943359375, "geo/layer_7/stable_rank_down_proj": 144.01766967773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38303205370903015, "geo/layer_7/attn_entropy_mean": 4.768037796020508, "geo/layer_7/attn_entropy_std": 0.7633181214332581, "geo/layer_14/stable_rank_q_proj": 51.656349182128906, "geo/layer_14/stable_rank_k_proj": 44.37709426879883, "geo/layer_14/stable_rank_o_proj": 42.21131896972656, "geo/layer_14/stable_rank_gate_proj": 71.89735412597656, "geo/layer_14/stable_rank_down_proj": 126.80957794189453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3731062114238739, "geo/layer_14/attn_entropy_mean": 5.492220878601074, "geo/layer_14/attn_entropy_std": 0.48347339034080505, "geo/layer_21/stable_rank_q_proj": 38.22205352783203, "geo/layer_21/stable_rank_k_proj": 28.5662841796875, "geo/layer_21/stable_rank_o_proj": 64.57540893554688, "geo/layer_21/stable_rank_gate_proj": 59.55393981933594, "geo/layer_21/stable_rank_down_proj": 48.421749114990234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1329827606678009, "geo/layer_21/attn_entropy_mean": 5.858475685119629, "geo/layer_21/attn_entropy_std": 0.33197060227394104, "geo/layer_27/stable_rank_q_proj": 45.41925048828125, "geo/layer_27/stable_rank_k_proj": 30.43122100830078, "geo/layer_27/stable_rank_o_proj": 106.6028823852539, "geo/layer_27/stable_rank_gate_proj": 68.96308898925781, "geo/layer_27/stable_rank_down_proj": 129.6268768310547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09683161973953247, "geo/layer_27/attn_entropy_mean": 4.3341498374938965, "geo/layer_27/attn_entropy_std": 0.6889030933380127, "attnres/final_alpha/block_0": 0.26124051213264465, "attnres/block_norm/0": 1.7846421003341675, "attnres/final_alpha/block_1": 0.0036594902630895376, "attnres/block_norm/1": 50702.78125, "attnres/final_alpha/block_2": 0.008039450272917747, "attnres/block_norm/2": 30118.953125, "attnres/final_alpha/block_3": 0.010206131264567375, "attnres/block_norm/3": 74625.8515625, "attnres/final_alpha/block_4": 0.011624744161963463, "attnres/block_norm/4": 17826.70703125, "attnres/final_alpha/block_5": 0.6091259717941284, "attnres/block_norm/5": 7190.4287109375, "attnres/final_alpha/block_6": 0.0961037278175354, "attnres/block_norm/6": 49514.421875, "geo/tier1_time_s": 1.3600504398345947, "geo/step": 15525.0, "geo/rankme_slope": 5.685856373799519e-05} {"step": 15530, "timestamp": 1778342500.5398355, "train/loss": 2.296812558174133, "train/z_loss": 0.0013849915005266667, "train/perplexity": 9.942440942465366, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696903.7182668035, "perf/iters_per_sec": 0.8091467467626589, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2358697652816772, "data/tokens_consumed": 32570867712, "data/tokens_consumed_B": 32.570867712, "train/loss_slope": -2.783974476701831e-06} {"step": 15540, "timestamp": 1778342510.9169056, "train/loss": 2.2986754417419433, "train/z_loss": 0.0013657996314577758, "train/perplexity": 9.960979814840021, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022001.1029876382, "perf/iters_per_sec": 0.9641652598322097, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371665954589844, "data/tokens_consumed": 32591839232, "data/tokens_consumed_B": 32.591839232, "train/loss_slope": -4.6831290213295655e-06} {"step": 15550, "timestamp": 1778342521.2799609, "grad/layer_0/attn": 0.0026770769618451595, "grad/layer_0/mlp": 0.0030078876297920942, "grad/layer_0/attn_mlp_ratio": 0.890018911054968, "grad/layer_4/attn": 0.0016849680105224252, "grad/layer_4/mlp": 0.0026727118529379368, "grad/layer_4/attn_mlp_ratio": 0.6304338215984144, "grad/layer_8/attn": 0.0028775702230632305, "grad/layer_8/mlp": 0.003381660208106041, "grad/layer_8/attn_mlp_ratio": 0.8509341450309256, "grad/layer_12/attn": 0.012154491618275642, "grad/layer_12/mlp": 0.00720873661339283, "grad/layer_12/attn_mlp_ratio": 1.6860778942993282, "grad/layer_16/attn": 0.005728947464376688, "grad/layer_16/mlp": 0.005291725508868694, "grad/layer_16/attn_mlp_ratio": 1.0826236822966084, "grad/layer_20/attn": 0.003964018542319536, "grad/layer_20/mlp": 0.0070103127509355545, "grad/layer_20/attn_mlp_ratio": 0.5654552980171951, "grad/layer_24/attn": 0.012497716583311558, "grad/layer_24/mlp": 0.01220702100545168, "grad/layer_24/attn_mlp_ratio": 1.0238137933365294, "grad/layer_27/attn": 0.00981816090643406, "grad/layer_27/mlp": 0.012894690968096256, "grad/layer_27/attn_mlp_ratio": 0.761411099698691} {"step": 15550, "timestamp": 1778342521.2955084, "train/loss": 2.349275302886963, "train/z_loss": 0.001378079259302467, "train/perplexity": 10.477973615391218, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021878.4013004613, "perf/iters_per_sec": 0.9641067511083895, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372295379638672, "data/tokens_consumed": 32612810752, "data/tokens_consumed_B": 32.612810752, "train/loss_slope": -2.4739284326534427e-06} {"step": 15560, "timestamp": 1778342531.672605, "train/loss": 2.331293821334839, "train/z_loss": 0.0013672796543687583, "train/perplexity": 10.291247959289993, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022129.2584889508, "perf/iters_per_sec": 0.9642263691372637, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037100863456726, "data/tokens_consumed": 32633782272, "data/tokens_consumed_B": 32.633782272, "train/loss_slope": -3.0073029552176864e-06} {"step": 15570, "timestamp": 1778342542.044744, "train/loss": 2.371847724914551, "train/z_loss": 0.0013702937634661795, "train/perplexity": 10.717176394523266, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023265.657976814, "perf/iters_per_sec": 0.9647682466396399, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036518359184265, "data/tokens_consumed": 32654753792, "data/tokens_consumed_B": 32.654753792, "train/loss_slope": -1.6627972668940249e-06} {"step": 15580, "timestamp": 1778342552.414051, "train/loss": 2.3596881151199343, "train/z_loss": 0.0013694358174689114, "train/perplexity": 10.58764880985917, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023837.6429562957, "perf/iters_per_sec": 0.9650409903317908, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362254142761231, "data/tokens_consumed": 32675725312, "data/tokens_consumed_B": 32.675725312, "train/loss_slope": -4.868257736036367e-07} {"step": 15590, "timestamp": 1778342562.7915282, "train/loss": 2.317689895629883, "train/z_loss": 0.001372974447440356, "train/perplexity": 10.152194567155522, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021818.1713762856, "perf/iters_per_sec": 0.9640780312425068, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372604370117187, "data/tokens_consumed": 32696696832, "data/tokens_consumed_B": 32.696696832, "train/loss_slope": -2.2567950602661835e-06} {"step": 15600, "timestamp": 1778342573.158585, "grad/layer_0/attn": 0.003366780001670122, "grad/layer_0/mlp": 0.0035501038655638695, "grad/layer_0/attn_mlp_ratio": 0.948360959095284, "grad/layer_4/attn": 0.0026832730509340763, "grad/layer_4/mlp": 0.0027249502018094063, "grad/layer_4/attn_mlp_ratio": 0.9847053170666431, "grad/layer_8/attn": 0.003498005447909236, "grad/layer_8/mlp": 0.0035840384662151337, "grad/layer_8/attn_mlp_ratio": 0.9759954819914922, "grad/layer_12/attn": 0.005117699038237333, "grad/layer_12/mlp": 0.007133960258215666, "grad/layer_12/attn_mlp_ratio": 0.7173713871767804, "grad/layer_16/attn": 0.004073809832334518, "grad/layer_16/mlp": 0.005069420672953129, "grad/layer_16/attn_mlp_ratio": 0.8036045960259423, "grad/layer_20/attn": 0.004884605761617422, "grad/layer_20/mlp": 0.006222526077181101, "grad/layer_20/attn_mlp_ratio": 0.7849875794062506, "grad/layer_24/attn": 0.005586864892393351, "grad/layer_24/mlp": 0.008562544360756874, "grad/layer_24/attn_mlp_ratio": 0.6524771833884887, "grad/layer_27/attn": 0.005189861636608839, "grad/layer_27/mlp": 0.0073307896964251995, "grad/layer_27/attn_mlp_ratio": 0.7079539559488711} {"step": 15600, "timestamp": 1778342573.7717986, "eos/sharpness": 7.192802429199217, "eos/L0_probe": 2.3067069053649902, "eos/L_plus": 2.347548723220825, "eos/L_minus": 2.3377931118011475, "eos/grad_norm": 0.09234675019979477, "eos/embed_grad_frac": 0.3127545416355133, "eos/time_s": 0.6104185581207275} {"step": 15600, "timestamp": 1778342573.8059893, "train/loss": 2.3405691385269165, "train/z_loss": 0.0013587148627266289, "train/perplexity": 10.387146606073879, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905370.2587862287, "perf/iters_per_sec": 0.9085513395243782, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1006532669067384, "data/tokens_consumed": 32717668352, "data/tokens_consumed_B": 32.717668352, "train/loss_slope": -6.453687923647334e-07} {"step": 15600, "timestamp": 1778342575.1784973, "geo/rankme_last": 430.117431640625, "geo/layer_0/stable_rank_q_proj": 20.5914363861084, "geo/layer_0/stable_rank_k_proj": 16.745973587036133, "geo/layer_0/stable_rank_o_proj": 43.73870849609375, "geo/layer_0/stable_rank_gate_proj": 123.3493423461914, "geo/layer_0/stable_rank_down_proj": 57.89291000366211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06984122097492218, "geo/layer_0/attn_entropy_mean": 6.228974342346191, "geo/layer_0/attn_entropy_std": 0.46670883893966675, "geo/layer_7/stable_rank_q_proj": 41.609771728515625, "geo/layer_7/stable_rank_k_proj": 38.61107635498047, "geo/layer_7/stable_rank_o_proj": 87.21759796142578, "geo/layer_7/stable_rank_gate_proj": 77.33723449707031, "geo/layer_7/stable_rank_down_proj": 143.90965270996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38015806674957275, "geo/layer_7/attn_entropy_mean": 4.746259689331055, "geo/layer_7/attn_entropy_std": 0.7679674029350281, "geo/layer_14/stable_rank_q_proj": 51.66292190551758, "geo/layer_14/stable_rank_k_proj": 44.319820404052734, "geo/layer_14/stable_rank_o_proj": 42.175537109375, "geo/layer_14/stable_rank_gate_proj": 71.88668823242188, "geo/layer_14/stable_rank_down_proj": 126.96123504638672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36756598949432373, "geo/layer_14/attn_entropy_mean": 5.53586483001709, "geo/layer_14/attn_entropy_std": 0.45710787177085876, "geo/layer_21/stable_rank_q_proj": 38.205936431884766, "geo/layer_21/stable_rank_k_proj": 28.410799026489258, "geo/layer_21/stable_rank_o_proj": 64.53252410888672, "geo/layer_21/stable_rank_gate_proj": 59.53862380981445, "geo/layer_21/stable_rank_down_proj": 48.487876892089844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1381273865699768, "geo/layer_21/attn_entropy_mean": 5.86893367767334, "geo/layer_21/attn_entropy_std": 0.3390011489391327, "geo/layer_27/stable_rank_q_proj": 45.488624572753906, "geo/layer_27/stable_rank_k_proj": 30.428009033203125, "geo/layer_27/stable_rank_o_proj": 106.52822875976562, "geo/layer_27/stable_rank_gate_proj": 68.88688659667969, "geo/layer_27/stable_rank_down_proj": 129.5972442626953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09733413904905319, "geo/layer_27/attn_entropy_mean": 4.31506872177124, "geo/layer_27/attn_entropy_std": 0.6959003210067749, "attnres/final_alpha/block_0": 0.2633150815963745, "attnres/block_norm/0": 1.78486967086792, "attnres/final_alpha/block_1": 0.0036783511750400066, "attnres/block_norm/1": 50613.51171875, "attnres/final_alpha/block_2": 0.008006295189261436, "attnres/block_norm/2": 30139.31640625, "attnres/final_alpha/block_3": 0.010173995047807693, "attnres/block_norm/3": 73781.0, "attnres/final_alpha/block_4": 0.011710645630955696, "attnres/block_norm/4": 17821.00390625, "attnres/final_alpha/block_5": 0.6052815914154053, "attnres/block_norm/5": 7295.4208984375, "attnres/final_alpha/block_6": 0.09783400595188141, "attnres/block_norm/6": 50076.3515625, "geo/tier1_time_s": 1.3680148124694824, "geo/step": 15600.0, "geo/rankme_slope": 5.821881486969788e-05} {"step": 15610, "timestamp": 1778342585.551272, "train/loss": 2.3054915189743044, "train/z_loss": 0.001370481273625046, "train/perplexity": 10.029106537311375, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.01999919593334198, "optim/adamw_lr": 0.0005999758780002594, "perf/tokens_per_sec": 1786198.6709547166, "perf/iters_per_sec": 0.8517258982442458, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1740866422653198, "data/tokens_consumed": 32738639872, "data/tokens_consumed_B": 32.738639872, "train/loss_slope": -5.045536849865662e-06} {"step": 15620, "timestamp": 1778342595.932978, "train/loss": 2.3383501291275026, "train/z_loss": 0.0013650701846927404, "train/perplexity": 10.364122984385743, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.019995257258415222, "optim/adamw_lr": 0.0005998577177524566, "perf/tokens_per_sec": 2021440.2361649668, "perf/iters_per_sec": 0.9638978176903567, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03745436668396, "data/tokens_consumed": 32759611392, "data/tokens_consumed_B": 32.759611392, "train/loss_slope": -6.573237394234963e-06} {"step": 15630, "timestamp": 1778342606.3143892, "train/loss": 2.33527090549469, "train/z_loss": 0.0013758899993263185, "train/perplexity": 10.33225861589424, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.019988037943840027, "optim/adamw_lr": 0.0005996411383152008, "perf/tokens_per_sec": 2021566.2763030375, "perf/iters_per_sec": 0.9639579183116138, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373896837234498, "data/tokens_consumed": 32780582912, "data/tokens_consumed_B": 32.780582912, "train/loss_slope": -5.603678148500213e-06} {"step": 15640, "timestamp": 1778342616.6964343, "train/loss": 2.2980062484741213, "train/z_loss": 0.0013735089567489922, "train/perplexity": 9.954316224070734, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.019977539777755737, "optim/adamw_lr": 0.0005993261933326721, "perf/tokens_per_sec": 2021583.8850781706, "perf/iters_per_sec": 0.9639663148299077, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373806476593017, "data/tokens_consumed": 32801554432, "data/tokens_consumed_B": 32.801554432, "train/loss_slope": -8.467385577897917e-06} {"step": 15650, "timestamp": 1778342627.065231, "grad/layer_0/attn": 0.003147788578644395, "grad/layer_0/mlp": 0.0035120630636811256, "grad/layer_0/attn_mlp_ratio": 0.8962790337019677, "grad/layer_4/attn": 0.002588076749816537, "grad/layer_4/mlp": 0.0027750192675739527, "grad/layer_4/attn_mlp_ratio": 0.9326337610678194, "grad/layer_8/attn": 0.00694397883489728, "grad/layer_8/mlp": 0.003551794681698084, "grad/layer_8/attn_mlp_ratio": 1.955061950842053, "grad/layer_12/attn": 0.00803947914391756, "grad/layer_12/mlp": 0.007733633741736412, "grad/layer_12/attn_mlp_ratio": 1.0395474247216854, "grad/layer_16/attn": 0.003416865598410368, "grad/layer_16/mlp": 0.004250229336321354, "grad/layer_16/attn_mlp_ratio": 0.8039249761932202, "grad/layer_20/attn": 0.0035276522394269705, "grad/layer_20/mlp": 0.0059639825485646725, "grad/layer_20/attn_mlp_ratio": 0.5914927066858514, "grad/layer_24/attn": 0.008167585358023643, "grad/layer_24/mlp": 0.010598967783153057, "grad/layer_24/attn_mlp_ratio": 0.7706019537058821, "grad/layer_27/attn": 0.012333047576248646, "grad/layer_27/mlp": 0.009499238803982735, "grad/layer_27/attn_mlp_ratio": 1.2983195496933744} {"step": 15650, "timestamp": 1778342627.0803804, "train/loss": 2.3348555326461793, "train/z_loss": 0.00136487886775285, "train/perplexity": 10.32796776741408, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.019963766932487487, "optim/adamw_lr": 0.0005989130079746246, "perf/tokens_per_sec": 2021253.1793918826, "perf/iters_per_sec": 0.9638086220702565, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0375503778457642, "data/tokens_consumed": 32822525952, "data/tokens_consumed_B": 32.822525952, "train/loss_slope": -7.811046755424658e-06} {"step": 15660, "timestamp": 1778342637.4597232, "train/loss": 2.3572596311569214, "train/z_loss": 0.0013748673722147942, "train/perplexity": 10.56196806977223, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.019946723580360412, "optim/adamw_lr": 0.0005984017074108124, "perf/tokens_per_sec": 2021636.9920281095, "perf/iters_per_sec": 0.9639916381969974, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373533964157104, "data/tokens_consumed": 32843497472, "data/tokens_consumed_B": 32.843497472, "train/loss_slope": -7.4605146185470286e-06} {"step": 15670, "timestamp": 1778342647.8371987, "train/loss": 2.2942150115966795, "train/z_loss": 0.0013793957303278148, "train/perplexity": 9.916648502058711, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.019926415085792543, "optim/adamw_lr": 0.0005977924525737762, "perf/tokens_per_sec": 2022216.6106297579, "perf/iters_per_sec": 0.9642680218838491, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037056064605713, "data/tokens_consumed": 32864468992, "data/tokens_consumed_B": 32.864468992, "train/loss_slope": -1.2290572090045837e-05} {"step": 15675, "timestamp": 1778342653.6036217, "eos/sharpness": 58.53331089019774, "eos/L0_probe": 2.30100154876709, "eos/L_plus": 2.5709033012390137, "eos/L_minus": 2.6164329051971436, "eos/grad_norm": 0.1664426326751709, "eos/embed_grad_frac": 0.08475786447525024, "eos/time_s": 0.5888636112213135} {"step": 15675, "timestamp": 1778342654.9845145, "geo/rankme_last": 430.48345947265625, "geo/layer_0/stable_rank_q_proj": 20.620338439941406, "geo/layer_0/stable_rank_k_proj": 16.742773056030273, "geo/layer_0/stable_rank_o_proj": 43.77480697631836, "geo/layer_0/stable_rank_gate_proj": 123.47978210449219, "geo/layer_0/stable_rank_down_proj": 57.910335540771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06658954918384552, "geo/layer_0/attn_entropy_mean": 6.230069160461426, "geo/layer_0/attn_entropy_std": 0.46728184819221497, "geo/layer_7/stable_rank_q_proj": 41.67306900024414, "geo/layer_7/stable_rank_k_proj": 38.590999603271484, "geo/layer_7/stable_rank_o_proj": 87.06573486328125, "geo/layer_7/stable_rank_gate_proj": 77.24134063720703, "geo/layer_7/stable_rank_down_proj": 144.14389038085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3981913924217224, "geo/layer_7/attn_entropy_mean": 4.751292705535889, "geo/layer_7/attn_entropy_std": 0.7702372074127197, "geo/layer_14/stable_rank_q_proj": 51.648983001708984, "geo/layer_14/stable_rank_k_proj": 44.29928970336914, "geo/layer_14/stable_rank_o_proj": 42.183494567871094, "geo/layer_14/stable_rank_gate_proj": 71.90202331542969, "geo/layer_14/stable_rank_down_proj": 126.96385955810547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36519256234169006, "geo/layer_14/attn_entropy_mean": 5.508114337921143, "geo/layer_14/attn_entropy_std": 0.48488813638687134, "geo/layer_21/stable_rank_q_proj": 38.211143493652344, "geo/layer_21/stable_rank_k_proj": 28.369279861450195, "geo/layer_21/stable_rank_o_proj": 64.56326293945312, "geo/layer_21/stable_rank_gate_proj": 59.525211334228516, "geo/layer_21/stable_rank_down_proj": 48.4805793762207, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1306125968694687, "geo/layer_21/attn_entropy_mean": 5.830340385437012, "geo/layer_21/attn_entropy_std": 0.3412202298641205, "geo/layer_27/stable_rank_q_proj": 45.60175323486328, "geo/layer_27/stable_rank_k_proj": 30.471202850341797, "geo/layer_27/stable_rank_o_proj": 106.31315612792969, "geo/layer_27/stable_rank_gate_proj": 68.8935775756836, "geo/layer_27/stable_rank_down_proj": 129.77281188964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10422689467668533, "geo/layer_27/attn_entropy_mean": 4.307605743408203, "geo/layer_27/attn_entropy_std": 0.7139219045639038, "attnres/final_alpha/block_0": 0.2662999927997589, "attnres/block_norm/0": 1.7847330570220947, "attnres/final_alpha/block_1": 0.0037484029307961464, "attnres/block_norm/1": 50676.3046875, "attnres/final_alpha/block_2": 0.008261838927865028, "attnres/block_norm/2": 29985.17578125, "attnres/final_alpha/block_3": 0.010437319986522198, "attnres/block_norm/3": 73857.546875, "attnres/final_alpha/block_4": 0.011900344863533974, "attnres/block_norm/4": 17760.80859375, "attnres/final_alpha/block_5": 0.5987861752510071, "attnres/block_norm/5": 7358.4208984375, "attnres/final_alpha/block_6": 0.10056595504283905, "attnres/block_norm/6": 49640.59375, "geo/tier1_time_s": 1.36336350440979, "geo/step": 15675.0, "geo/rankme_slope": 6.733214770283114e-05} {"step": 15680, "timestamp": 1778342660.1747792, "train/loss": 2.3257515668869018, "train/z_loss": 0.0013712882297113537, "train/perplexity": 10.234369008914499, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.01990284860134125, "optim/adamw_lr": 0.0005970854580402374, "perf/tokens_per_sec": 1700760.6051642867, "perf/iters_per_sec": 0.8109858537503656, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2330671310424806, "data/tokens_consumed": 32885440512, "data/tokens_consumed_B": 32.885440512, "train/loss_slope": -1.1892266807132606e-05} {"step": 15690, "timestamp": 1778342670.553595, "train/loss": 2.328383755683899, "train/z_loss": 0.0013761338894255459, "train/perplexity": 10.261343285484312, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.01987603187561035, "optim/adamw_lr": 0.0005962809562683104, "perf/tokens_per_sec": 2022019.463043652, "perf/iters_per_sec": 0.9641740145891438, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0371571779251099, "data/tokens_consumed": 32906412032, "data/tokens_consumed_B": 32.906412032, "train/loss_slope": -1.143603525181763e-05} {"step": 15700, "timestamp": 1778342681.3365169, "grad/layer_0/attn": 0.0034091980196535587, "grad/layer_0/mlp": 0.003411766840144992, "grad/layer_0/attn_mlp_ratio": 0.9992470410387048, "grad/layer_4/attn": 0.0031247080769389868, "grad/layer_4/mlp": 0.0028355007525533438, "grad/layer_4/attn_mlp_ratio": 1.101995111066609, "grad/layer_8/attn": 0.004479403141885996, "grad/layer_8/mlp": 0.0035402148496359587, "grad/layer_8/attn_mlp_ratio": 1.2652912903908884, "grad/layer_12/attn": 0.008509461767971516, "grad/layer_12/mlp": 0.00802319124341011, "grad/layer_12/attn_mlp_ratio": 1.060608105147686, "grad/layer_16/attn": 0.003799538128077984, "grad/layer_16/mlp": 0.005253883544355631, "grad/layer_16/attn_mlp_ratio": 0.7231865768782139, "grad/layer_20/attn": 0.003003855235874653, "grad/layer_20/mlp": 0.0059786573983728886, "grad/layer_20/attn_mlp_ratio": 0.502429723845556, "grad/layer_24/attn": 0.005585489794611931, "grad/layer_24/mlp": 0.008095829747617245, "grad/layer_24/attn_mlp_ratio": 0.6899218362717746, "grad/layer_27/attn": 0.00870482623577118, "grad/layer_27/mlp": 0.006852213758975267, "grad/layer_27/attn_mlp_ratio": 1.2703669813762886} {"step": 15700, "timestamp": 1778342681.3512523, "train/loss": 2.305765461921692, "train/z_loss": 0.001371211081277579, "train/perplexity": 10.031854316666076, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.01984597325325012, "optim/adamw_lr": 0.0005953791975975036, "perf/tokens_per_sec": 1943516.1370802496, "perf/iters_per_sec": 0.9267407117272614, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0790504693984986, "data/tokens_consumed": 32927383552, "data/tokens_consumed_B": 32.927383552, "train/loss_slope": -1.4526378017554768e-05} {"step": 15710, "timestamp": 1778342691.731058, "train/loss": 2.2817140340805055, "train/z_loss": 0.001384558924473822, "train/perplexity": 9.793452342671344, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.019812682867050173, "optim/adamw_lr": 0.0005943804860115051, "perf/tokens_per_sec": 2021735.454026507, "perf/iters_per_sec": 0.9640385885365043, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0373028755187987, "data/tokens_consumed": 32948355072, "data/tokens_consumed_B": 32.948355072, "train/loss_slope": -1.882298886150049e-05} {"step": 15720, "timestamp": 1778342702.10919, "train/loss": 2.3260083913803102, "train/z_loss": 0.0013650950277224183, "train/perplexity": 10.23699778310292, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.01977617144584656, "optim/adamw_lr": 0.0005932851433753967, "perf/tokens_per_sec": 2021795.7719649952, "perf/iters_per_sec": 0.9640673503708816, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372719287872314, "data/tokens_consumed": 32969326592, "data/tokens_consumed_B": 32.969326592, "train/loss_slope": -1.7216275684689638e-05} {"step": 15730, "timestamp": 1778342712.4801745, "train/loss": 2.3358023881912233, "train/z_loss": 0.0013767740922048688, "train/perplexity": 10.33775149211973, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.019736450910568238, "optim/adamw_lr": 0.000592093527317047, "perf/tokens_per_sec": 2023557.0333743738, "perf/iters_per_sec": 0.9649071852561826, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363691091537475, "data/tokens_consumed": 32990298112, "data/tokens_consumed_B": 32.990298112, "train/loss_slope": -1.4380721218980006e-05} {"step": 15740, "timestamp": 1778342722.8516438, "train/loss": 2.3068516731262205, "train/z_loss": 0.001360397320240736, "train/perplexity": 10.042756949436011, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.01969353497028351, "optim/adamw_lr": 0.0005908060491085052, "perf/tokens_per_sec": 2023356.738681548, "perf/iters_per_sec": 0.9648116773040524, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036471700668335, "data/tokens_consumed": 33011269632, "data/tokens_consumed_B": 33.011269632, "train/loss_slope": -1.4766230741993487e-05} {"step": 15750, "timestamp": 1778342733.2168407, "grad/layer_0/attn": 0.003128892509266734, "grad/layer_0/mlp": 0.003494780045002699, "grad/layer_0/attn_mlp_ratio": 0.895304534032231, "grad/layer_4/attn": 0.001992968376725912, "grad/layer_4/mlp": 0.0028720239643007517, "grad/layer_4/attn_mlp_ratio": 0.6939246789393242, "grad/layer_8/attn": 0.005969684105366468, "grad/layer_8/mlp": 0.0037039038725197315, "grad/layer_8/attn_mlp_ratio": 1.6117275581810906, "grad/layer_12/attn": 0.007550091948360205, "grad/layer_12/mlp": 0.007578153163194656, "grad/layer_12/attn_mlp_ratio": 0.9962970774198064, "grad/layer_16/attn": 0.0039636376313865185, "grad/layer_16/mlp": 0.004837065003812313, "grad/layer_16/attn_mlp_ratio": 0.8194302839262168, "grad/layer_20/attn": 0.002832636469975114, "grad/layer_20/mlp": 0.005775109864771366, "grad/layer_20/attn_mlp_ratio": 0.4904904819569538, "grad/layer_24/attn": 0.005659738089889288, "grad/layer_24/mlp": 0.008209695108234882, "grad/layer_24/attn_mlp_ratio": 0.6893968589981496, "grad/layer_27/attn": 0.009694548323750496, "grad/layer_27/mlp": 0.009336099959909916, "grad/layer_27/attn_mlp_ratio": 1.038393789863049} {"step": 15750, "timestamp": 1778342733.808209, "eos/sharpness": 9.98415946960449, "eos/L0_probe": 2.303771734237671, "eos/L_plus": 2.357062816619873, "eos/L_minus": 2.3503222465515137, "eos/grad_norm": 0.10725628584623337, "eos/embed_grad_frac": 0.24526530504226685, "eos/time_s": 0.5886650085449219} {"step": 15750, "timestamp": 1778342733.8277025, "train/loss": 2.3105904340744017, "train/z_loss": 0.0013714635628275574, "train/perplexity": 10.080374694990217, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.01964743673801422, "optim/adamw_lr": 0.0005894231021404266, "perf/tokens_per_sec": 1911510.1055268026, "perf/iters_per_sec": 0.9114790465959561, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0971179246902465, "data/tokens_consumed": 33032241152, "data/tokens_consumed_B": 33.032241152, "train/loss_slope": -1.5042414824024748e-05} {"step": 15750, "timestamp": 1778342735.191333, "geo/rankme_last": 430.82623291015625, "geo/layer_0/stable_rank_q_proj": 20.64042091369629, "geo/layer_0/stable_rank_k_proj": 16.750255584716797, "geo/layer_0/stable_rank_o_proj": 43.82713317871094, "geo/layer_0/stable_rank_gate_proj": 123.4712905883789, "geo/layer_0/stable_rank_down_proj": 57.91765594482422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0644863024353981, "geo/layer_0/attn_entropy_mean": 6.230379581451416, "geo/layer_0/attn_entropy_std": 0.4677707254886627, "geo/layer_7/stable_rank_q_proj": 41.68408203125, "geo/layer_7/stable_rank_k_proj": 38.58655548095703, "geo/layer_7/stable_rank_o_proj": 87.04383850097656, "geo/layer_7/stable_rank_gate_proj": 77.07644653320312, "geo/layer_7/stable_rank_down_proj": 144.46707153320312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3809031546115875, "geo/layer_7/attn_entropy_mean": 4.750678062438965, "geo/layer_7/attn_entropy_std": 0.7655979990959167, "geo/layer_14/stable_rank_q_proj": 51.672607421875, "geo/layer_14/stable_rank_k_proj": 44.383583068847656, "geo/layer_14/stable_rank_o_proj": 42.13313674926758, "geo/layer_14/stable_rank_gate_proj": 71.8781509399414, "geo/layer_14/stable_rank_down_proj": 126.97454071044922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36524903774261475, "geo/layer_14/attn_entropy_mean": 5.4828410148620605, "geo/layer_14/attn_entropy_std": 0.5076273679733276, "geo/layer_21/stable_rank_q_proj": 38.305030822753906, "geo/layer_21/stable_rank_k_proj": 28.36281967163086, "geo/layer_21/stable_rank_o_proj": 64.43103790283203, "geo/layer_21/stable_rank_gate_proj": 59.54238510131836, "geo/layer_21/stable_rank_down_proj": 48.5372428894043, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13028572499752045, "geo/layer_21/attn_entropy_mean": 5.831111907958984, "geo/layer_21/attn_entropy_std": 0.3353140950202942, "geo/layer_27/stable_rank_q_proj": 45.64500045776367, "geo/layer_27/stable_rank_k_proj": 30.447397232055664, "geo/layer_27/stable_rank_o_proj": 106.28030395507812, "geo/layer_27/stable_rank_gate_proj": 68.80934143066406, "geo/layer_27/stable_rank_down_proj": 129.47018432617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10640129446983337, "geo/layer_27/attn_entropy_mean": 4.301817417144775, "geo/layer_27/attn_entropy_std": 0.6903704404830933, "attnres/final_alpha/block_0": 0.26453784108161926, "attnres/block_norm/0": 1.7846643924713135, "attnres/final_alpha/block_1": 0.003692199010401964, "attnres/block_norm/1": 50919.0546875, "attnres/final_alpha/block_2": 0.008028706535696983, "attnres/block_norm/2": 30074.169921875, "attnres/final_alpha/block_3": 0.010301658883690834, "attnres/block_norm/3": 73585.375, "attnres/final_alpha/block_4": 0.011773078702390194, "attnres/block_norm/4": 17834.0703125, "attnres/final_alpha/block_5": 0.6030048131942749, "attnres/block_norm/5": 7301.73291015625, "attnres/final_alpha/block_6": 0.09866167604923248, "attnres/block_norm/6": 49873.21484375, "geo/tier1_time_s": 1.3601925373077393, "geo/step": 15750.0, "geo/rankme_slope": 0.0001105875162565026} {"step": 15760, "timestamp": 1778342745.5606747, "train/loss": 2.2940879583358766, "train/z_loss": 0.0013828996219672264, "train/perplexity": 9.915388639566803, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.019598172307014466, "optim/adamw_lr": 0.000587945169210434, "perf/tokens_per_sec": 1787879.928792073, "perf/iters_per_sec": 0.8525275844536175, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1729825735092163, "data/tokens_consumed": 33053212672, "data/tokens_consumed_B": 33.053212672, "train/loss_slope": -1.4636290370255503e-05} {"step": 15770, "timestamp": 1778342755.931576, "train/loss": 2.28426194190979, "train/z_loss": 0.0013681198004633188, "train/perplexity": 9.81843697232137, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.019545757174491883, "optim/adamw_lr": 0.0005863727152347564, "perf/tokens_per_sec": 2023469.1930091765, "perf/iters_per_sec": 0.9648652997060664, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036414098739624, "data/tokens_consumed": 33074184192, "data/tokens_consumed_B": 33.074184192, "train/loss_slope": -1.589067613903257e-05} {"step": 15780, "timestamp": 1778342766.313365, "train/loss": 2.307949423789978, "train/z_loss": 0.001366180321201682, "train/perplexity": 10.053787445802866, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.019490208625793457, "optim/adamw_lr": 0.0005847062587738037, "perf/tokens_per_sec": 2021050.2292091653, "perf/iters_per_sec": 0.9637118478818728, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0376545667648316, "data/tokens_consumed": 33095155712, "data/tokens_consumed_B": 33.095155712, "train/loss_slope": -1.6239517428228924e-05} {"step": 15790, "timestamp": 1778342776.6925094, "train/loss": 2.3816709995269774, "train/z_loss": 0.0013685016427189111, "train/perplexity": 10.822972944849592, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.01943154513835907, "optim/adamw_lr": 0.0005829463541507721, "perf/tokens_per_sec": 2022041.2631380719, "perf/iters_per_sec": 0.9641844096842155, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03714599609375, "data/tokens_consumed": 33116127232, "data/tokens_consumed_B": 33.116127232, "train/loss_slope": -1.3153996855297623e-05} {"step": 15800, "timestamp": 1778342787.0577679, "grad/layer_0/attn": 0.0037552942521870136, "grad/layer_0/mlp": 0.0038164937868714333, "grad/layer_0/attn_mlp_ratio": 0.9839644352910026, "grad/layer_4/attn": 0.002065635984763503, "grad/layer_4/mlp": 0.002746515441685915, "grad/layer_4/attn_mlp_ratio": 0.7520933173003411, "grad/layer_8/attn": 0.005235305987298489, "grad/layer_8/mlp": 0.003464262466877699, "grad/layer_8/attn_mlp_ratio": 1.5112324444902026, "grad/layer_12/attn": 0.007686425466090441, "grad/layer_12/mlp": 0.007823135703802109, "grad/layer_12/attn_mlp_ratio": 0.982524867119752, "grad/layer_16/attn": 0.004040274303406477, "grad/layer_16/mlp": 0.004588468465954065, "grad/layer_16/attn_mlp_ratio": 0.8805278374106931, "grad/layer_20/attn": 0.0032584513537585735, "grad/layer_20/mlp": 0.006657203193753958, "grad/layer_20/attn_mlp_ratio": 0.48946249798556957, "grad/layer_24/attn": 0.007096318062394857, "grad/layer_24/mlp": 0.008929118514060974, "grad/layer_24/attn_mlp_ratio": 0.7947389175926115, "grad/layer_27/attn": 0.007944699376821518, "grad/layer_27/mlp": 0.009250765666365623, "grad/layer_27/attn_mlp_ratio": 0.8588153216144804} {"step": 15800, "timestamp": 1778342787.0718474, "train/loss": 2.3545702934265136, "train/z_loss": 0.0013579313759692012, "train/perplexity": 10.533601531232852, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.019369786381721498, "optim/adamw_lr": 0.0005810935914516448, "perf/tokens_per_sec": 2021440.282619918, "perf/iters_per_sec": 0.9638978398418035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374543428421021, "data/tokens_consumed": 33137098752, "data/tokens_consumed_B": 33.137098752, "train/loss_slope": -1.0987887609981305e-05} {"step": 15810, "timestamp": 1778342797.4516137, "train/loss": 2.291371726989746, "train/z_loss": 0.0013682254939340055, "train/perplexity": 9.88849269447584, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.019304951429367067, "optim/adamw_lr": 0.0005791485428810119, "perf/tokens_per_sec": 2021847.1705214675, "perf/iters_per_sec": 0.9640918591124856, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0372455596923829, "data/tokens_consumed": 33158070272, "data/tokens_consumed_B": 33.158070272, "train/loss_slope": -1.366789333104292e-05} {"step": 15820, "timestamp": 1778342807.8215103, "train/loss": 2.309589982032776, "train/z_loss": 0.0013688916340470314, "train/perplexity": 10.070294806609466, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.019237062335014345, "optim/adamw_lr": 0.0005771118700504303, "perf/tokens_per_sec": 2023759.6958042495, "perf/iters_per_sec": 0.965003822233319, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362653255462646, "data/tokens_consumed": 33179041792, "data/tokens_consumed_B": 33.179041792, "train/loss_slope": -1.432092106095518e-05} {"step": 15825, "timestamp": 1778342813.5807135, "eos/sharpness": 45.584607124328606, "eos/L0_probe": 2.304332971572876, "eos/L_plus": 2.5075161457061768, "eos/L_minus": 2.5569958686828613, "eos/grad_norm": 0.1266181617975235, "eos/embed_grad_frac": 0.13533452153205872, "eos/time_s": 0.5893306732177734} {"step": 15825, "timestamp": 1778342814.9589162, "geo/rankme_last": 429.5009765625, "geo/layer_0/stable_rank_q_proj": 20.59468650817871, "geo/layer_0/stable_rank_k_proj": 16.71209716796875, "geo/layer_0/stable_rank_o_proj": 43.78819274902344, "geo/layer_0/stable_rank_gate_proj": 123.64015197753906, "geo/layer_0/stable_rank_down_proj": 57.88359069824219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06814465671777725, "geo/layer_0/attn_entropy_mean": 6.230079650878906, "geo/layer_0/attn_entropy_std": 0.47223079204559326, "geo/layer_7/stable_rank_q_proj": 41.74440383911133, "geo/layer_7/stable_rank_k_proj": 38.5247917175293, "geo/layer_7/stable_rank_o_proj": 86.91398620605469, "geo/layer_7/stable_rank_gate_proj": 77.1293716430664, "geo/layer_7/stable_rank_down_proj": 144.36203002929688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4003046751022339, "geo/layer_7/attn_entropy_mean": 4.740923881530762, "geo/layer_7/attn_entropy_std": 0.7536014318466187, "geo/layer_14/stable_rank_q_proj": 51.713314056396484, "geo/layer_14/stable_rank_k_proj": 44.461090087890625, "geo/layer_14/stable_rank_o_proj": 42.114383697509766, "geo/layer_14/stable_rank_gate_proj": 71.76407623291016, "geo/layer_14/stable_rank_down_proj": 126.42796325683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3547578752040863, "geo/layer_14/attn_entropy_mean": 5.497154235839844, "geo/layer_14/attn_entropy_std": 0.48608532547950745, "geo/layer_21/stable_rank_q_proj": 38.290889739990234, "geo/layer_21/stable_rank_k_proj": 28.451108932495117, "geo/layer_21/stable_rank_o_proj": 64.39277648925781, "geo/layer_21/stable_rank_gate_proj": 59.4401969909668, "geo/layer_21/stable_rank_down_proj": 48.52339172363281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13320565223693848, "geo/layer_21/attn_entropy_mean": 5.856690406799316, "geo/layer_21/attn_entropy_std": 0.33634334802627563, "geo/layer_27/stable_rank_q_proj": 45.62370681762695, "geo/layer_27/stable_rank_k_proj": 30.47734260559082, "geo/layer_27/stable_rank_o_proj": 106.17288208007812, "geo/layer_27/stable_rank_gate_proj": 68.73064422607422, "geo/layer_27/stable_rank_down_proj": 129.3596649169922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09783776849508286, "geo/layer_27/attn_entropy_mean": 4.311175346374512, "geo/layer_27/attn_entropy_std": 0.7101014852523804, "attnres/final_alpha/block_0": 0.2644610106945038, "attnres/block_norm/0": 1.7846450805664062, "attnres/final_alpha/block_1": 0.003721491666510701, "attnres/block_norm/1": 50908.5546875, "attnres/final_alpha/block_2": 0.008065612055361271, "attnres/block_norm/2": 30181.5703125, "attnres/final_alpha/block_3": 0.010387198999524117, "attnres/block_norm/3": 73977.34375, "attnres/final_alpha/block_4": 0.011657936498522758, "attnres/block_norm/4": 17929.033203125, "attnres/final_alpha/block_5": 0.6020218133926392, "attnres/block_norm/5": 7271.02197265625, "attnres/final_alpha/block_6": 0.0996849536895752, "attnres/block_norm/6": 50016.91015625, "geo/tier1_time_s": 1.3609199523925781, "geo/step": 15825.0, "geo/rankme_slope": 0.00010159542332558023} {"step": 15830, "timestamp": 1778342820.574907, "train/loss": 2.338949751853943, "train/z_loss": 0.0013749516801908612, "train/perplexity": 10.370339411636074, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.019166141152381896, "optim/adamw_lr": 0.0005749842345714568, "perf/tokens_per_sec": 1644977.4227942238, "perf/iters_per_sec": 0.7843863595934981, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.274881935119629, "data/tokens_consumed": 33200013312, "data/tokens_consumed_B": 33.200013312, "train/loss_slope": -1.0690241234816744e-05} {"step": 15840, "timestamp": 1778342830.9579153, "train/loss": 2.271090936660767, "train/z_loss": 0.0013751654536463321, "train/perplexity": 9.689966189304897, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.01909221172332764, "optim/adamw_lr": 0.000572766351699829, "perf/tokens_per_sec": 2021356.6671646468, "perf/iters_per_sec": 0.9638579688857302, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374972581863404, "data/tokens_consumed": 33220984832, "data/tokens_consumed_B": 33.220984832, "train/loss_slope": -1.1294350179150807e-05} {"step": 15850, "timestamp": 1778342841.8779714, "grad/layer_0/attn": 0.0030988052021712065, "grad/layer_0/mlp": 0.0032784000504761934, "grad/layer_0/attn_mlp_ratio": 0.9452187225287613, "grad/layer_4/attn": 0.0019704720471054316, "grad/layer_4/mlp": 0.0027345619164407253, "grad/layer_4/attn_mlp_ratio": 0.7205804934240165, "grad/layer_8/attn": 0.006850591395050287, "grad/layer_8/mlp": 0.0035237190313637257, "grad/layer_8/attn_mlp_ratio": 1.9441366180621265, "grad/layer_12/attn": 0.004812290892004967, "grad/layer_12/mlp": 0.006651376374065876, "grad/layer_12/attn_mlp_ratio": 0.7235030088536386, "grad/layer_16/attn": 0.0032568771857768297, "grad/layer_16/mlp": 0.004498220980167389, "grad/layer_16/attn_mlp_ratio": 0.7240367086749845, "grad/layer_20/attn": 0.0033951830118894577, "grad/layer_20/mlp": 0.005261282902210951, "grad/layer_20/attn_mlp_ratio": 0.6453146524265467, "grad/layer_24/attn": 0.0054513029754161835, "grad/layer_24/mlp": 0.007117586210370064, "grad/layer_24/attn_mlp_ratio": 0.7658920788180446, "grad/layer_27/attn": 0.005610332824289799, "grad/layer_27/mlp": 0.00622257124632597, "grad/layer_27/attn_mlp_ratio": 0.9016100438289617} {"step": 15850, "timestamp": 1778342841.8924625, "train/loss": 2.336413860321045, "train/z_loss": 0.0013725049910135567, "train/perplexity": 10.34407467206932, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.019015297889709473, "optim/adamw_lr": 0.0005704589366912841, "perf/tokens_per_sec": 1919523.795145594, "perf/iters_per_sec": 0.9153002715805025, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0925376415252686, "data/tokens_consumed": 33241956352, "data/tokens_consumed_B": 33.241956352, "train/loss_slope": -8.027345383807381e-06} {"step": 15860, "timestamp": 1778342852.2686675, "train/loss": 2.3002145290374756, "train/z_loss": 0.001359392935410142, "train/perplexity": 9.976322436112214, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.01893542468547821, "optim/adamw_lr": 0.0005680627405643462, "perf/tokens_per_sec": 2022491.2216412183, "perf/iters_per_sec": 0.9643989666181652, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369152545928955, "data/tokens_consumed": 33262927872, "data/tokens_consumed_B": 33.262927872, "train/loss_slope": -6.8841002991060504e-06} {"step": 15870, "timestamp": 1778342862.646369, "train/loss": 2.319690465927124, "train/z_loss": 0.0013761449372395873, "train/perplexity": 10.17252507558333, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.018852618336677552, "optim/adamw_lr": 0.0005655785501003265, "perf/tokens_per_sec": 2022210.7528259729, "perf/iters_per_sec": 0.9642652286653389, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0370590686798096, "data/tokens_consumed": 33283899392, "data/tokens_consumed_B": 33.283899392, "train/loss_slope": -8.506595026148341e-06} {"step": 15880, "timestamp": 1778342873.0140836, "train/loss": 2.3272634744644165, "train/z_loss": 0.0013666622573509813, "train/perplexity": 10.249854132057928, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.018766906261444092, "optim/adamw_lr": 0.0005630071878433227, "perf/tokens_per_sec": 2024166.0740761468, "perf/iters_per_sec": 0.9651975984936461, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360572814941407, "data/tokens_consumed": 33304870912, "data/tokens_consumed_B": 33.304870912, "train/loss_slope": -6.592949124166486e-06} {"step": 15890, "timestamp": 1778342883.3817432, "train/loss": 2.3084779500961305, "train/z_loss": 0.0013685621204786002, "train/perplexity": 10.059102541404636, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.018678317070007326, "optim/adamw_lr": 0.0005603495121002197, "perf/tokens_per_sec": 2023667.3683264113, "perf/iters_per_sec": 0.9649597970611626, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0363126039505004, "data/tokens_consumed": 33325842432, "data/tokens_consumed_B": 33.325842432, "train/loss_slope": -7.833705605095629e-06} {"step": 15900, "timestamp": 1778342893.7450619, "grad/layer_0/attn": 0.003513667732477188, "grad/layer_0/mlp": 0.003482232568785548, "grad/layer_0/attn_mlp_ratio": 1.0090272726384486, "grad/layer_4/attn": 0.0019571606535464525, "grad/layer_4/mlp": 0.0026666519697755575, "grad/layer_4/attn_mlp_ratio": 0.7339392625417307, "grad/layer_8/attn": 0.0037888074293732643, "grad/layer_8/mlp": 0.0037561573553830385, "grad/layer_8/attn_mlp_ratio": 1.0086923869348008, "grad/layer_12/attn": 0.006260007619857788, "grad/layer_12/mlp": 0.006675963755697012, "grad/layer_12/attn_mlp_ratio": 0.9376934559817512, "grad/layer_16/attn": 0.003361489623785019, "grad/layer_16/mlp": 0.004411778412759304, "grad/layer_16/attn_mlp_ratio": 0.7619352635367472, "grad/layer_20/attn": 0.003974664956331253, "grad/layer_20/mlp": 0.005608211737126112, "grad/layer_20/attn_mlp_ratio": 0.7087223292849155, "grad/layer_24/attn": 0.010131076909601688, "grad/layer_24/mlp": 0.00929850060492754, "grad/layer_24/attn_mlp_ratio": 1.0895387580314904, "grad/layer_27/attn": 0.006713937036693096, "grad/layer_27/mlp": 0.009567891247570515, "grad/layer_27/attn_mlp_ratio": 0.701715434759604} {"step": 15900, "timestamp": 1778342894.3427224, "eos/sharpness": 27.86188125610351, "eos/L0_probe": 2.2990615367889404, "eos/L_plus": 2.4444313049316406, "eos/L_minus": 2.4323105812072754, "eos/grad_norm": 0.1137162521481514, "eos/embed_grad_frac": 0.19671133160591125, "eos/time_s": 0.5948946475982666} {"step": 15900, "timestamp": 1778342894.3629498, "train/loss": 2.33903431892395, "train/z_loss": 0.001372748031280935, "train/perplexity": 10.371216437938346, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.018586878776550294, "optim/adamw_lr": 0.0005576063632965087, "perf/tokens_per_sec": 1910898.213943431, "perf/iters_per_sec": 0.9111872739522128, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0974692344665526, "data/tokens_consumed": 33346813952, "data/tokens_consumed_B": 33.346813952, "train/loss_slope": -5.660312382480217e-06} {"step": 15900, "timestamp": 1778342895.7293243, "geo/rankme_last": 430.1601257324219, "geo/layer_0/stable_rank_q_proj": 20.58173370361328, "geo/layer_0/stable_rank_k_proj": 16.700483322143555, "geo/layer_0/stable_rank_o_proj": 43.83102035522461, "geo/layer_0/stable_rank_gate_proj": 123.7457504272461, "geo/layer_0/stable_rank_down_proj": 57.81119918823242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061220910400152206, "geo/layer_0/attn_entropy_mean": 6.228171348571777, "geo/layer_0/attn_entropy_std": 0.4723036587238312, "geo/layer_7/stable_rank_q_proj": 41.72266387939453, "geo/layer_7/stable_rank_k_proj": 38.467376708984375, "geo/layer_7/stable_rank_o_proj": 86.85731506347656, "geo/layer_7/stable_rank_gate_proj": 77.1529312133789, "geo/layer_7/stable_rank_down_proj": 144.10484313964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.397687703371048, "geo/layer_7/attn_entropy_mean": 4.728631019592285, "geo/layer_7/attn_entropy_std": 0.7555546760559082, "geo/layer_14/stable_rank_q_proj": 51.78824234008789, "geo/layer_14/stable_rank_k_proj": 44.681671142578125, "geo/layer_14/stable_rank_o_proj": 42.11841583251953, "geo/layer_14/stable_rank_gate_proj": 71.61360931396484, "geo/layer_14/stable_rank_down_proj": 126.4039535522461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3602641224861145, "geo/layer_14/attn_entropy_mean": 5.514652729034424, "geo/layer_14/attn_entropy_std": 0.495060533285141, "geo/layer_21/stable_rank_q_proj": 38.26542663574219, "geo/layer_21/stable_rank_k_proj": 28.413089752197266, "geo/layer_21/stable_rank_o_proj": 64.31171417236328, "geo/layer_21/stable_rank_gate_proj": 59.38092041015625, "geo/layer_21/stable_rank_down_proj": 48.55525588989258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1362989991903305, "geo/layer_21/attn_entropy_mean": 5.842811584472656, "geo/layer_21/attn_entropy_std": 0.33761459589004517, "geo/layer_27/stable_rank_q_proj": 45.64853286743164, "geo/layer_27/stable_rank_k_proj": 30.41754722595215, "geo/layer_27/stable_rank_o_proj": 106.071044921875, "geo/layer_27/stable_rank_gate_proj": 68.65311431884766, "geo/layer_27/stable_rank_down_proj": 129.38827514648438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09803148359060287, "geo/layer_27/attn_entropy_mean": 4.295236587524414, "geo/layer_27/attn_entropy_std": 0.6866666674613953, "attnres/final_alpha/block_0": 0.26453831791877747, "attnres/block_norm/0": 1.7847856283187866, "attnres/final_alpha/block_1": 0.0037468753289431334, "attnres/block_norm/1": 50741.23046875, "attnres/final_alpha/block_2": 0.008102982304990292, "attnres/block_norm/2": 30144.6953125, "attnres/final_alpha/block_3": 0.010217774659395218, "attnres/block_norm/3": 73787.4375, "attnres/final_alpha/block_4": 0.011628778651356697, "attnres/block_norm/4": 17882.84375, "attnres/final_alpha/block_5": 0.6026296615600586, "attnres/block_norm/5": 7256.82958984375, "attnres/final_alpha/block_6": 0.09913556277751923, "attnres/block_norm/6": 49779.859375, "geo/tier1_time_s": 1.3623201847076416, "geo/step": 15900.0, "geo/rankme_slope": 0.00011426973133003201} {"step": 15910, "timestamp": 1778342906.5119355, "train/loss": 2.3031521081924438, "train/z_loss": 0.0013740010559558868, "train/perplexity": 10.005671759819032, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.018492622375488283, "optim/adamw_lr": 0.0005547786712646484, "perf/tokens_per_sec": 1726768.7516363494, "perf/iters_per_sec": 0.8233875044042346, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2144949913024903, "data/tokens_consumed": 33367785472, "data/tokens_consumed_B": 33.367785472, "train/loss_slope": -8.603677254627222e-06} {"step": 15920, "timestamp": 1778342916.8838458, "train/loss": 2.3333439111709593, "train/z_loss": 0.0013630428002215922, "train/perplexity": 10.312367583298673, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.018395577669143678, "optim/adamw_lr": 0.0005518673300743103, "perf/tokens_per_sec": 2023240.8995911076, "perf/iters_per_sec": 0.9647564409213579, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365310430526733, "data/tokens_consumed": 33388756992, "data/tokens_consumed_B": 33.388756992, "train/loss_slope": -7.95326151362839e-06} {"step": 15930, "timestamp": 1778342927.2580724, "train/loss": 2.331610679626465, "train/z_loss": 0.001372234825976193, "train/perplexity": 10.294509343208054, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.018295778036117556, "optim/adamw_lr": 0.0005488733410835265, "perf/tokens_per_sec": 2022936.2162969706, "perf/iters_per_sec": 0.9646111566052297, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036687159538269, "data/tokens_consumed": 33409728512, "data/tokens_consumed_B": 33.409728512, "train/loss_slope": -6.286482062741711e-06} {"step": 15940, "timestamp": 1778342937.6373956, "train/loss": 2.3034165620803835, "train/z_loss": 0.0013685439596883952, "train/perplexity": 10.008318148525833, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.018193255066871642, "optim/adamw_lr": 0.0005457976520061493, "perf/tokens_per_sec": 2021489.8048122393, "perf/iters_per_sec": 0.963921453863258, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0374289274215698, "data/tokens_consumed": 33430700032, "data/tokens_consumed_B": 33.430700032, "train/loss_slope": -5.9386959814144474e-06} {"step": 15950, "timestamp": 1778342947.997378, "grad/layer_0/attn": 0.003061617724597454, "grad/layer_0/mlp": 0.0033726103138178587, "grad/layer_0/attn_mlp_ratio": 0.9077887300750059, "grad/layer_4/attn": 0.001890641637146473, "grad/layer_4/mlp": 0.00263175624422729, "grad/layer_4/attn_mlp_ratio": 0.7183953945028236, "grad/layer_8/attn": 0.004156041424721479, "grad/layer_8/mlp": 0.003435826860368252, "grad/layer_8/attn_mlp_ratio": 1.2096189571421359, "grad/layer_12/attn": 0.008490169420838356, "grad/layer_12/mlp": 0.007454909384250641, "grad/layer_12/attn_mlp_ratio": 1.1388695515049274, "grad/layer_16/attn": 0.003376440377905965, "grad/layer_16/mlp": 0.004677006974816322, "grad/layer_16/attn_mlp_ratio": 0.7219232992155707, "grad/layer_20/attn": 0.00493444362655282, "grad/layer_20/mlp": 0.005363774485886097, "grad/layer_20/attn_mlp_ratio": 0.9199573075902572, "grad/layer_24/attn": 0.004922959487885237, "grad/layer_24/mlp": 0.007236123085021973, "grad/layer_24/attn_mlp_ratio": 0.6803310781213971, "grad/layer_27/attn": 0.006723604165017605, "grad/layer_27/mlp": 0.006595872808247805, "grad/layer_27/attn_mlp_ratio": 1.0193653301915626} {"step": 15950, "timestamp": 1778342948.0115504, "train/loss": 2.3128082275390627, "train/z_loss": 0.0013808370917104185, "train/perplexity": 10.102755693152217, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.018088042736053467, "optim/adamw_lr": 0.0005426412820816039, "perf/tokens_per_sec": 2022471.8764666272, "perf/iters_per_sec": 0.9643897421200882, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0369251728057862, "data/tokens_consumed": 33451671552, "data/tokens_consumed_B": 33.451671552, "train/loss_slope": -4.927917360865919e-06} {"step": 15960, "timestamp": 1778342958.38115, "train/loss": 2.3317637920379637, "train/z_loss": 0.0013775474508292973, "train/perplexity": 10.296085681034153, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.017980175614356993, "optim/adamw_lr": 0.0005394052684307098, "perf/tokens_per_sec": 2023785.0722047172, "perf/iters_per_sec": 0.9650159226440035, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0362523317337036, "data/tokens_consumed": 33472643072, "data/tokens_consumed_B": 33.472643072, "train/loss_slope": -6.6823922153567e-06} {"step": 15970, "timestamp": 1778342968.7540715, "train/loss": 2.2933284044265747, "train/z_loss": 0.0013776203617453576, "train/perplexity": 9.907860226842972, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.017869689464569093, "optim/adamw_lr": 0.0005360906839370727, "perf/tokens_per_sec": 2022546.6085505679, "perf/iters_per_sec": 0.9644253771546211, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368868589401246, "data/tokens_consumed": 33493614592, "data/tokens_consumed_B": 33.493614592, "train/loss_slope": -8.176657189988014e-06} {"step": 15975, "timestamp": 1778342974.508026, "eos/sharpness": 64.62233066558836, "eos/L0_probe": 2.294407844543457, "eos/L_plus": 2.590055227279663, "eos/L_minus": 2.6449837684631348, "eos/grad_norm": 0.20644716918468475, "eos/embed_grad_frac": 0.05465845763683319, "eos/time_s": 0.5841727256774902} {"step": 15975, "timestamp": 1778342975.8882902, "geo/rankme_last": 430.3750915527344, "geo/layer_0/stable_rank_q_proj": 20.604496002197266, "geo/layer_0/stable_rank_k_proj": 16.699100494384766, "geo/layer_0/stable_rank_o_proj": 43.809349060058594, "geo/layer_0/stable_rank_gate_proj": 123.69812774658203, "geo/layer_0/stable_rank_down_proj": 57.91261672973633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06565482169389725, "geo/layer_0/attn_entropy_mean": 6.230963706970215, "geo/layer_0/attn_entropy_std": 0.46849173307418823, "geo/layer_7/stable_rank_q_proj": 41.70732498168945, "geo/layer_7/stable_rank_k_proj": 38.48662567138672, "geo/layer_7/stable_rank_o_proj": 86.91468048095703, "geo/layer_7/stable_rank_gate_proj": 77.1031723022461, "geo/layer_7/stable_rank_down_proj": 144.07032775878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39099550247192383, "geo/layer_7/attn_entropy_mean": 4.723938465118408, "geo/layer_7/attn_entropy_std": 0.7456281185150146, "geo/layer_14/stable_rank_q_proj": 51.69143295288086, "geo/layer_14/stable_rank_k_proj": 44.73249816894531, "geo/layer_14/stable_rank_o_proj": 42.11690902709961, "geo/layer_14/stable_rank_gate_proj": 71.67448425292969, "geo/layer_14/stable_rank_down_proj": 126.54817962646484, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3758135139942169, "geo/layer_14/attn_entropy_mean": 5.500011920928955, "geo/layer_14/attn_entropy_std": 0.4733656942844391, "geo/layer_21/stable_rank_q_proj": 38.178951263427734, "geo/layer_21/stable_rank_k_proj": 28.41594696044922, "geo/layer_21/stable_rank_o_proj": 64.25820922851562, "geo/layer_21/stable_rank_gate_proj": 59.35324478149414, "geo/layer_21/stable_rank_down_proj": 48.556068420410156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13877181708812714, "geo/layer_21/attn_entropy_mean": 5.845879554748535, "geo/layer_21/attn_entropy_std": 0.3280889391899109, "geo/layer_27/stable_rank_q_proj": 45.68538284301758, "geo/layer_27/stable_rank_k_proj": 30.39268684387207, "geo/layer_27/stable_rank_o_proj": 106.05884552001953, "geo/layer_27/stable_rank_gate_proj": 68.61553955078125, "geo/layer_27/stable_rank_down_proj": 129.4503173828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09236516803503036, "geo/layer_27/attn_entropy_mean": 4.311836242675781, "geo/layer_27/attn_entropy_std": 0.7014034390449524, "attnres/final_alpha/block_0": 0.2660254240036011, "attnres/block_norm/0": 1.7850239276885986, "attnres/final_alpha/block_1": 0.003772916505113244, "attnres/block_norm/1": 50713.45703125, "attnres/final_alpha/block_2": 0.008180662989616394, "attnres/block_norm/2": 30130.77734375, "attnres/final_alpha/block_3": 0.010369818657636642, "attnres/block_norm/3": 74183.328125, "attnres/final_alpha/block_4": 0.0116175701841712, "attnres/block_norm/4": 17840.16796875, "attnres/final_alpha/block_5": 0.6012067794799805, "attnres/block_norm/5": 7247.08984375, "attnres/final_alpha/block_6": 0.09882684051990509, "attnres/block_norm/6": 49636.6015625, "geo/tier1_time_s": 1.3609185218811035, "geo/step": 15975.0, "geo/rankme_slope": 0.00013245921415441176} {"step": 15980, "timestamp": 1778342981.0653563, "train/loss": 2.313625764846802, "train/z_loss": 0.001371725182980299, "train/perplexity": 10.111018449938099, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.017756619453430177, "optim/adamw_lr": 0.0005326985836029052, "perf/tokens_per_sec": 1704146.1231740536, "perf/iters_per_sec": 0.8126001945371883, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2306174755096435, "data/tokens_consumed": 33514586112, "data/tokens_consumed_B": 33.514586112, "train/loss_slope": -7.149491466060864e-06} {"step": 15990, "timestamp": 1778342991.4183662, "train/loss": 2.3423308372497558, "train/z_loss": 0.0013625257182866336, "train/perplexity": 10.40546175713598, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.017641003727912902, "optim/adamw_lr": 0.000529230111837387, "perf/tokens_per_sec": 2026866.0190306788, "perf/iters_per_sec": 0.96648503257307, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346771717071532, "data/tokens_consumed": 33535557632, "data/tokens_consumed_B": 33.535557632, "train/loss_slope": -4.9539252297497335e-06} {"step": 16000, "timestamp": 1778343001.78057, "grad/layer_0/attn": 0.002830119803547859, "grad/layer_0/mlp": 0.0031104381196200848, "grad/layer_0/attn_mlp_ratio": 0.9098781598348338, "grad/layer_4/attn": 0.0021741739474236965, "grad/layer_4/mlp": 0.002695281058549881, "grad/layer_4/attn_mlp_ratio": 0.806659424203985, "grad/layer_8/attn": 0.009306751191616058, "grad/layer_8/mlp": 0.0034241676330566406, "grad/layer_8/attn_mlp_ratio": 2.7179600758950646, "grad/layer_12/attn": 0.0097229378297925, "grad/layer_12/mlp": 0.00686703622341156, "grad/layer_12/attn_mlp_ratio": 1.415885597786109, "grad/layer_16/attn": 0.0044166892766952515, "grad/layer_16/mlp": 0.003990068566054106, "grad/layer_16/attn_mlp_ratio": 1.1069206187529208, "grad/layer_20/attn": 0.003458744380623102, "grad/layer_20/mlp": 0.005291163921356201, "grad/layer_20/attn_mlp_ratio": 0.6536830774217006, "grad/layer_24/attn": 0.0060046701692044735, "grad/layer_24/mlp": 0.0074243429116904736, "grad/layer_24/attn_mlp_ratio": 0.8087813507201158, "grad/layer_27/attn": 0.004994549788534641, "grad/layer_27/mlp": 0.006075754761695862, "grad/layer_27/attn_mlp_ratio": 0.8220459683162011} {"step": 16000, "timestamp": 1778343001.7948616, "train/loss": 2.3508025407791138, "train/z_loss": 0.0013692855951376258, "train/perplexity": 10.493988199656782, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.017522879838943482, "optim/adamw_lr": 0.0005256863951683044, "perf/tokens_per_sec": 2021958.4810544655, "perf/iters_per_sec": 0.9641449361107185, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037188458442688, "data/tokens_consumed": 33556529152, "data/tokens_consumed_B": 33.556529152, "train/loss_slope": -1.9591417178139555e-06} {"step": 16000, "timestamp": 1778343008.9376972, "geo/ww_alpha_mean": 7.553721795940166, "geo/ww_alpha_std": 4.32420629372809, "geo/ww_alpha_min": 1.3432955882848119, "geo/ww_alpha_max": 28.865363950327556, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.8828026668255395, "geo/ww_alpha_by_type/k_proj": 4.630744533507985, "geo/ww_alpha_by_type/v_proj": 8.525071641317423, "geo/ww_alpha_by_type/o_proj": 8.418574733229317, "geo/ww_alpha_by_type/gate_proj": 7.534136200660535, "geo/ww_alpha_by_type/up_proj": 11.929435306981537, "geo/ww_alpha_by_type/down_proj": 8.052288022362344, "geo/twonn_id/layer_0": 0.7636190056800842, "geo/twonn_id/layer_7": 3.6172287464141846, "geo/twonn_id/layer_14": 4.423274040222168, "geo/twonn_id/layer_21": 7.833996772766113, "geo/twonn_id/layer_27": 6.6500396728515625, "geo/tier2_time_s": 7.134920835494995} {"step": 16000, "timestamp": 1778343009.8183053, "eoc/jacobian_sigma/layer_0/attn": 1555.4344482421875, "eoc/jacobian_sigma/layer_0/mlp": 10907.947265625, "eoc/jacobian_sigma/layer_0": 10907.947265625, "eoc/jacobian_sigma/layer_7/attn": 1.128471851348877, "eoc/jacobian_sigma/layer_7/mlp": 1.8164825439453125, "eoc/jacobian_sigma/layer_7": 1.8164825439453125, "eoc/jacobian_sigma/layer_14/attn": 2.6581788063049316, "eoc/jacobian_sigma/layer_14/mlp": 16.8080997467041, "eoc/jacobian_sigma/layer_14": 16.8080997467041, "eoc/jacobian_sigma/layer_21/attn": 1.0888677835464478, "eoc/jacobian_sigma/layer_21/mlp": 5.234485626220703, "eoc/jacobian_sigma/layer_21": 5.234485626220703, "eoc/jacobian_sigma/layer_27/attn": 3.5579476356506348, "eoc/jacobian_sigma/layer_27/mlp": 53.60540008544922, "eoc/jacobian_sigma/layer_27": 53.60540008544922, "eoc/layer0_sigma": 10907.947265625, "eoc/sigma_max": 53.60540008544922, "eoc/sigma_min": 1.8164825439453125, "eoc/sigma_mean": 19.366117000579834, "eoc/time_s": 0.8720452785491943} {"step": 16010, "timestamp": 1778343020.196061, "train/loss": 2.2907174110412596, "train/z_loss": 0.00137283387593925, "train/perplexity": 9.882024612314774, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.017402287125587463, "optim/adamw_lr": 0.0005220686137676238, "perf/tokens_per_sec": 1140099.8375501935, "perf/iters_per_sec": 0.5436419666052787, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8394459247589112, "data/tokens_consumed": 33577500672, "data/tokens_consumed_B": 33.577500672, "train/loss_slope": -3.4210748297653093e-06} {"step": 16020, "timestamp": 1778343030.5560298, "train/loss": 2.295557904243469, "train/z_loss": 0.0013749151607044042, "train/perplexity": 9.929974442063722, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.017279264330863953, "optim/adamw_lr": 0.0005183779299259185, "perf/tokens_per_sec": 2025404.8434774203, "perf/iters_per_sec": 0.9657882897746183, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354236125946046, "data/tokens_consumed": 33598472192, "data/tokens_consumed_B": 33.598472192, "train/loss_slope": -7.710619027143138e-06} {"step": 16030, "timestamp": 1778343040.9076173, "train/loss": 2.3059685707092283, "train/z_loss": 0.0013822939479723572, "train/perplexity": 10.033892081370027, "train/grad_norm": 0.21875, "optim/muon_lr": 0.017153851985931397, "optim/adamw_lr": 0.0005146155595779419, "perf/tokens_per_sec": 2026836.68892492, "perf/iters_per_sec": 0.9664710468887902, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034692144393921, "data/tokens_consumed": 33619443712, "data/tokens_consumed_B": 33.619443712, "train/loss_slope": -8.663354433587856e-06} {"step": 16040, "timestamp": 1778343051.2593973, "train/loss": 2.30171856880188, "train/z_loss": 0.0013770873309113085, "train/perplexity": 9.991338511315053, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.017026091814041137, "optim/adamw_lr": 0.0005107827544212341, "perf/tokens_per_sec": 2027489.0639161703, "perf/iters_per_sec": 0.9667821235257007, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0343592166900635, "data/tokens_consumed": 33640415232, "data/tokens_consumed_B": 33.640415232, "train/loss_slope": -1.0589002783220489e-05} {"step": 16050, "timestamp": 1778343061.5970545, "grad/layer_0/attn": 0.0025925699155777693, "grad/layer_0/mlp": 0.002948042470961809, "grad/layer_0/attn_mlp_ratio": 0.8794207862242418, "grad/layer_4/attn": 0.002255756640806794, "grad/layer_4/mlp": 0.0024842149578034878, "grad/layer_4/attn_mlp_ratio": 0.9080359744704656, "grad/layer_8/attn": 0.0028393357060849667, "grad/layer_8/mlp": 0.0034258596133440733, "grad/layer_8/attn_mlp_ratio": 0.8287950890182293, "grad/layer_12/attn": 0.009432635270059109, "grad/layer_12/mlp": 0.006964607164263725, "grad/layer_12/attn_mlp_ratio": 1.3543671469400933, "grad/layer_16/attn": 0.0032551779877394438, "grad/layer_16/mlp": 0.003963664639741182, "grad/layer_16/attn_mlp_ratio": 0.8212546220425291, "grad/layer_20/attn": 0.0023155068047344685, "grad/layer_20/mlp": 0.0052616652101278305, "grad/layer_20/attn_mlp_ratio": 0.4400710931342408, "grad/layer_24/attn": 0.00938290823251009, "grad/layer_24/mlp": 0.01008925586938858, "grad/layer_24/attn_mlp_ratio": 0.9299901064041202, "grad/layer_27/attn": 0.0040538436733186245, "grad/layer_27/mlp": 0.01002966333180666, "grad/layer_27/attn_mlp_ratio": 0.40418541468329205} {"step": 16050, "timestamp": 1778343062.2393718, "eos/sharpness": 46.93765640258788, "eos/L0_probe": 2.2923309803009033, "eos/L_plus": 2.522550106048584, "eos/L_minus": 2.5314884185791016, "eos/grad_norm": 0.14093047380447388, "eos/embed_grad_frac": 0.10599268227815628, "eos/time_s": 0.6391067504882812} {"step": 16050, "timestamp": 1778343062.2618604, "train/loss": 2.325763773918152, "train/z_loss": 0.0013825422385707498, "train/perplexity": 10.234493940939341, "train/grad_norm": 0.140625, "optim/muon_lr": 0.01689602553844452, "optim/adamw_lr": 0.0005068807661533356, "perf/tokens_per_sec": 1906851.8411099338, "perf/iters_per_sec": 0.9092578130292577, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0997980833053589, "data/tokens_consumed": 33661386752, "data/tokens_consumed_B": 33.661386752, "train/loss_slope": -1.1242714852425447e-05} {"step": 16050, "timestamp": 1778343063.628483, "geo/rankme_last": 429.2351989746094, "geo/layer_0/stable_rank_q_proj": 20.60457992553711, "geo/layer_0/stable_rank_k_proj": 16.702333450317383, "geo/layer_0/stable_rank_o_proj": 43.776309967041016, "geo/layer_0/stable_rank_gate_proj": 123.56452941894531, "geo/layer_0/stable_rank_down_proj": 57.94588088989258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06480033695697784, "geo/layer_0/attn_entropy_mean": 6.2328033447265625, "geo/layer_0/attn_entropy_std": 0.46890103816986084, "geo/layer_7/stable_rank_q_proj": 41.72629165649414, "geo/layer_7/stable_rank_k_proj": 38.55443572998047, "geo/layer_7/stable_rank_o_proj": 86.87445068359375, "geo/layer_7/stable_rank_gate_proj": 77.03474426269531, "geo/layer_7/stable_rank_down_proj": 144.46974182128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.38807040452957153, "geo/layer_7/attn_entropy_mean": 4.73056697845459, "geo/layer_7/attn_entropy_std": 0.7472261190414429, "geo/layer_14/stable_rank_q_proj": 51.724525451660156, "geo/layer_14/stable_rank_k_proj": 44.77049255371094, "geo/layer_14/stable_rank_o_proj": 42.081966400146484, "geo/layer_14/stable_rank_gate_proj": 71.76974487304688, "geo/layer_14/stable_rank_down_proj": 126.40570068359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3739529252052307, "geo/layer_14/attn_entropy_mean": 5.48361873626709, "geo/layer_14/attn_entropy_std": 0.47898831963539124, "geo/layer_21/stable_rank_q_proj": 38.23793029785156, "geo/layer_21/stable_rank_k_proj": 28.47455406188965, "geo/layer_21/stable_rank_o_proj": 64.23702239990234, "geo/layer_21/stable_rank_gate_proj": 59.40987014770508, "geo/layer_21/stable_rank_down_proj": 48.56782913208008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1331898421049118, "geo/layer_21/attn_entropy_mean": 5.839313507080078, "geo/layer_21/attn_entropy_std": 0.3321864902973175, "geo/layer_27/stable_rank_q_proj": 45.70887756347656, "geo/layer_27/stable_rank_k_proj": 30.390525817871094, "geo/layer_27/stable_rank_o_proj": 106.0897216796875, "geo/layer_27/stable_rank_gate_proj": 68.53545379638672, "geo/layer_27/stable_rank_down_proj": 129.32620239257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1091003492474556, "geo/layer_27/attn_entropy_mean": 4.301346778869629, "geo/layer_27/attn_entropy_std": 0.6990392208099365, "attnres/final_alpha/block_0": 0.2663460373878479, "attnres/block_norm/0": 1.785292625427246, "attnres/final_alpha/block_1": 0.003701248439028859, "attnres/block_norm/1": 50810.09375, "attnres/final_alpha/block_2": 0.008135301992297173, "attnres/block_norm/2": 30225.84765625, "attnres/final_alpha/block_3": 0.010494297370314598, "attnres/block_norm/3": 74651.9375, "attnres/final_alpha/block_4": 0.011749733239412308, "attnres/block_norm/4": 17872.828125, "attnres/final_alpha/block_5": 0.6000552773475647, "attnres/block_norm/5": 7297.802734375, "attnres/final_alpha/block_6": 0.09951809048652649, "attnres/block_norm/6": 49782.421875, "geo/tier1_time_s": 1.3622674942016602, "geo/step": 16050.0, "geo/rankme_slope": 0.00013539306347539016} {"step": 16060, "timestamp": 1778343073.976771, "train/loss": 2.3322620153427125, "train/z_loss": 0.0013613896095193922, "train/perplexity": 10.301216708960842, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.016763695478439332, "optim/adamw_lr": 0.0005029108643531799, "perf/tokens_per_sec": 1790787.538377641, "perf/iters_per_sec": 0.853914040745564, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1710780620574952, "data/tokens_consumed": 33682358272, "data/tokens_consumed_B": 33.682358272, "train/loss_slope": -1.2330271337184645e-05} {"step": 16070, "timestamp": 1778343084.3217857, "train/loss": 2.321774196624756, "train/z_loss": 0.0013826026930473744, "train/perplexity": 10.193743977917249, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.01662914514541626, "optim/adamw_lr": 0.0004988743543624878, "perf/tokens_per_sec": 2028587.7065029016, "perf/iters_per_sec": 0.9673059971346386, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0337990283966065, "data/tokens_consumed": 33703329792, "data/tokens_consumed_B": 33.703329792, "train/loss_slope": -1.4953734142945489e-05} {"step": 16080, "timestamp": 1778343094.6778905, "train/loss": 2.319684290885925, "train/z_loss": 0.0013805146794766189, "train/perplexity": 10.172462260015838, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.016492419242858887, "optim/adamw_lr": 0.0004947725772857666, "perf/tokens_per_sec": 2026022.836971817, "perf/iters_per_sec": 0.9660829720362745, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351077795028687, "data/tokens_consumed": 33724301312, "data/tokens_consumed_B": 33.724301312, "train/loss_slope": -1.652879569039053e-05} {"step": 16090, "timestamp": 1778343105.0369396, "train/loss": 2.31005175113678, "train/z_loss": 0.0013826568727381527, "train/perplexity": 10.074946031432683, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.016353562474250793, "optim/adamw_lr": 0.0004906068742275238, "perf/tokens_per_sec": 2025733.7368379077, "perf/iters_per_sec": 0.965945118349985, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03525550365448, "data/tokens_consumed": 33745272832, "data/tokens_consumed_B": 33.745272832, "train/loss_slope": -1.66820360882447e-05} {"step": 16100, "timestamp": 1778343115.3795807, "grad/layer_0/attn": 0.002838226268067956, "grad/layer_0/mlp": 0.003170242067426443, "grad/layer_0/attn_mlp_ratio": 0.8952711238372011, "grad/layer_4/attn": 0.0018604123033583164, "grad/layer_4/mlp": 0.0026370580308139324, "grad/layer_4/attn_mlp_ratio": 0.7054877864160306, "grad/layer_8/attn": 0.0035153720527887344, "grad/layer_8/mlp": 0.0036053576041013002, "grad/layer_8/attn_mlp_ratio": 0.9750411308120129, "grad/layer_12/attn": 0.0057724760845303535, "grad/layer_12/mlp": 0.007006543222814798, "grad/layer_12/attn_mlp_ratio": 0.8238693202301257, "grad/layer_16/attn": 0.003565984545275569, "grad/layer_16/mlp": 0.004324168432503939, "grad/layer_16/attn_mlp_ratio": 0.824663636135075, "grad/layer_20/attn": 0.0030447004828602076, "grad/layer_20/mlp": 0.005473512224853039, "grad/layer_20/attn_mlp_ratio": 0.5562608252539116, "grad/layer_24/attn": 0.004576601553708315, "grad/layer_24/mlp": 0.007146386429667473, "grad/layer_24/attn_mlp_ratio": 0.6404077829696218, "grad/layer_27/attn": 0.006459690164774656, "grad/layer_27/mlp": 0.006376003846526146, "grad/layer_27/attn_mlp_ratio": 1.0131251829437944} {"step": 16100, "timestamp": 1778343115.3936067, "train/loss": 2.3447259426116944, "train/z_loss": 0.0013791669975034893, "train/perplexity": 10.430413803846198, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.01621261954307556, "optim/adamw_lr": 0.0004863785862922668, "perf/tokens_per_sec": 2025980.651953034, "perf/iters_per_sec": 0.9660628566517993, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351293325424193, "data/tokens_consumed": 33766244352, "data/tokens_consumed_B": 33.766244352, "train/loss_slope": -1.6693517605964343e-05} {"step": 16110, "timestamp": 1778343125.756281, "train/loss": 2.3143287181854246, "train/z_loss": 0.0013689580489881336, "train/perplexity": 10.11812852284632, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.01606963813304901, "optim/adamw_lr": 0.0004820891439914703, "perf/tokens_per_sec": 2024769.3292143203, "perf/iters_per_sec": 0.9654852529594041, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357486009597778, "data/tokens_consumed": 33787215872, "data/tokens_consumed_B": 33.787215872, "train/loss_slope": -1.4349364747952427e-05} {"step": 16120, "timestamp": 1778343136.1192727, "train/loss": 2.30432448387146, "train/z_loss": 0.0013784510781988501, "train/perplexity": 10.017409044951904, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.015924663543701173, "optim/adamw_lr": 0.0004777399063110351, "perf/tokens_per_sec": 2025052.3264690556, "perf/iters_per_sec": 0.9656201965661314, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356038570404054, "data/tokens_consumed": 33808187392, "data/tokens_consumed_B": 33.808187392, "train/loss_slope": -1.5418803223324628e-05} {"step": 16125, "timestamp": 1778343141.8729167, "eos/sharpness": 5.647659301757812, "eos/L0_probe": 2.288170576095581, "eos/L_plus": 2.3276662826538086, "eos/L_minus": 2.3051514625549316, "eos/grad_norm": 0.08174702525138855, "eos/embed_grad_frac": 0.31889674067497253, "eos/time_s": 0.5835061073303223} {"step": 16125, "timestamp": 1778343143.2491539, "geo/rankme_last": 430.2417297363281, "geo/layer_0/stable_rank_q_proj": 20.567424774169922, "geo/layer_0/stable_rank_k_proj": 16.68003273010254, "geo/layer_0/stable_rank_o_proj": 43.71658706665039, "geo/layer_0/stable_rank_gate_proj": 123.39053344726562, "geo/layer_0/stable_rank_down_proj": 57.980323791503906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06375669687986374, "geo/layer_0/attn_entropy_mean": 6.230417251586914, "geo/layer_0/attn_entropy_std": 0.4718746244907379, "geo/layer_7/stable_rank_q_proj": 41.68590545654297, "geo/layer_7/stable_rank_k_proj": 38.62839889526367, "geo/layer_7/stable_rank_o_proj": 86.87067413330078, "geo/layer_7/stable_rank_gate_proj": 77.02859497070312, "geo/layer_7/stable_rank_down_proj": 144.54481506347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4007706940174103, "geo/layer_7/attn_entropy_mean": 4.725427627563477, "geo/layer_7/attn_entropy_std": 0.7552410364151001, "geo/layer_14/stable_rank_q_proj": 51.69749450683594, "geo/layer_14/stable_rank_k_proj": 44.79928207397461, "geo/layer_14/stable_rank_o_proj": 42.045597076416016, "geo/layer_14/stable_rank_gate_proj": 71.66187286376953, "geo/layer_14/stable_rank_down_proj": 126.61917114257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37576061487197876, "geo/layer_14/attn_entropy_mean": 5.528838157653809, "geo/layer_14/attn_entropy_std": 0.4608633518218994, "geo/layer_21/stable_rank_q_proj": 38.314327239990234, "geo/layer_21/stable_rank_k_proj": 28.50229835510254, "geo/layer_21/stable_rank_o_proj": 64.16558837890625, "geo/layer_21/stable_rank_gate_proj": 59.396175384521484, "geo/layer_21/stable_rank_down_proj": 48.624755859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13547401130199432, "geo/layer_21/attn_entropy_mean": 5.853192329406738, "geo/layer_21/attn_entropy_std": 0.32706552743911743, "geo/layer_27/stable_rank_q_proj": 45.62437438964844, "geo/layer_27/stable_rank_k_proj": 30.40670394897461, "geo/layer_27/stable_rank_o_proj": 106.08529663085938, "geo/layer_27/stable_rank_gate_proj": 68.53160858154297, "geo/layer_27/stable_rank_down_proj": 129.4202117919922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0930449590086937, "geo/layer_27/attn_entropy_mean": 4.304576396942139, "geo/layer_27/attn_entropy_std": 0.7130310535430908, "attnres/final_alpha/block_0": 0.26370447874069214, "attnres/block_norm/0": 1.785395860671997, "attnres/final_alpha/block_1": 0.003729431889951229, "attnres/block_norm/1": 50638.0390625, "attnres/final_alpha/block_2": 0.007978418841958046, "attnres/block_norm/2": 30238.2890625, "attnres/final_alpha/block_3": 0.01030591782182455, "attnres/block_norm/3": 74138.09375, "attnres/final_alpha/block_4": 0.011696201749145985, "attnres/block_norm/4": 17840.009765625, "attnres/final_alpha/block_5": 0.6038293242454529, "attnres/block_norm/5": 7256.958984375, "attnres/final_alpha/block_6": 0.09875625371932983, "attnres/block_norm/6": 49757.9140625, "geo/tier1_time_s": 1.3585119247436523, "geo/step": 16125.0, "geo/rankme_slope": 0.00011868006968412365} {"step": 16130, "timestamp": 1778343148.4269269, "train/loss": 2.2795902490615845, "train/z_loss": 0.0013868148671463132, "train/perplexity": 9.772675226176368, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.01577774465084076, "optim/adamw_lr": 0.00047333233952522274, "perf/tokens_per_sec": 1704749.4391175895, "perf/iters_per_sec": 0.8128878779972026, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2301819562911986, "data/tokens_consumed": 33829158912, "data/tokens_consumed_B": 33.829158912, "train/loss_slope": -1.643558632959089e-05} {"step": 16140, "timestamp": 1778343158.7813566, "train/loss": 2.2763946056365967, "train/z_loss": 0.0013746333890594542, "train/perplexity": 9.74149508768223, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.015628929138183593, "optim/adamw_lr": 0.00046886787414550775, "perf/tokens_per_sec": 2026312.4397469836, "perf/iters_per_sec": 0.9662210654005926, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349598407745362, "data/tokens_consumed": 33850130432, "data/tokens_consumed_B": 33.850130432, "train/loss_slope": -1.7594364331071152e-05} {"step": 16150, "timestamp": 1778343169.1358454, "grad/layer_0/attn": 0.0033292316365987062, "grad/layer_0/mlp": 0.003543923841789365, "grad/layer_0/attn_mlp_ratio": 0.9394196069901382, "grad/layer_4/attn": 0.001848055748268962, "grad/layer_4/mlp": 0.0026580500416457653, "grad/layer_4/attn_mlp_ratio": 0.6952674516232846, "grad/layer_8/attn": 0.0027702583465725183, "grad/layer_8/mlp": 0.0033658139873296022, "grad/layer_8/attn_mlp_ratio": 0.823057446042841, "grad/layer_12/attn": 0.007134119048714638, "grad/layer_12/mlp": 0.006676509510725737, "grad/layer_12/attn_mlp_ratio": 1.06854022006555, "grad/layer_16/attn": 0.0034069560933858156, "grad/layer_16/mlp": 0.004054246935993433, "grad/layer_16/attn_mlp_ratio": 0.8403425008735296, "grad/layer_20/attn": 0.0032509020529687405, "grad/layer_20/mlp": 0.005118590779602528, "grad/layer_20/attn_mlp_ratio": 0.6351166032674176, "grad/layer_24/attn": 0.004809328820556402, "grad/layer_24/mlp": 0.007228249218314886, "grad/layer_24/attn_mlp_ratio": 0.6653518173993472, "grad/layer_27/attn": 0.00394444540143013, "grad/layer_27/mlp": 0.006665125954896212, "grad/layer_27/attn_mlp_ratio": 0.5918035711466454} {"step": 16150, "timestamp": 1778343169.1499357, "train/loss": 2.3319766759872436, "train/z_loss": 0.0013733585714362563, "train/perplexity": 10.298277785739732, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.015478267073631287, "optim/adamw_lr": 0.00046434801220893857, "perf/tokens_per_sec": 2024042.7369039194, "perf/iters_per_sec": 0.9651387867469403, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361204147338867, "data/tokens_consumed": 33871101952, "data/tokens_consumed_B": 33.871101952, "train/loss_slope": -1.68911965516391e-05} {"step": 16160, "timestamp": 1778343179.501826, "train/loss": 2.338068056106567, "train/z_loss": 0.0013727803714573383, "train/perplexity": 10.36119995717913, "train/grad_norm": 0.078125, "optim/muon_lr": 0.015325806140899659, "optim/adamw_lr": 0.0004597741842269897, "perf/tokens_per_sec": 2026879.236537324, "perf/iters_per_sec": 0.9664913351713772, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346704244613647, "data/tokens_consumed": 33892073472, "data/tokens_consumed_B": 33.892073472, "train/loss_slope": -1.689616282566367e-05} {"step": 16170, "timestamp": 1778343189.8643878, "train/loss": 2.30204496383667, "train/z_loss": 0.0013700048672035336, "train/perplexity": 9.994600166861183, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.01517159640789032, "optim/adamw_lr": 0.0004551478922367096, "perf/tokens_per_sec": 2024665.678131418, "perf/iters_per_sec": 0.9654358282715884, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03580162525177, "data/tokens_consumed": 33913044992, "data/tokens_consumed_B": 33.913044992, "train/loss_slope": -1.643969744417069e-05} {"step": 16180, "timestamp": 1778343200.2238874, "train/loss": 2.278575038909912, "train/z_loss": 0.0013800847227685153, "train/perplexity": 9.762758941485886, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.015015689730644226, "optim/adamw_lr": 0.00045047069191932675, "perf/tokens_per_sec": 2025869.924636409, "perf/iters_per_sec": 0.9660100577528042, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351859092712403, "data/tokens_consumed": 33934016512, "data/tokens_consumed_B": 33.934016512, "train/loss_slope": -1.7187986422543623e-05} {"step": 16190, "timestamp": 1778343210.5769777, "train/loss": 2.276932787895203, "train/z_loss": 0.001374101301189512, "train/perplexity": 9.746739198527854, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.014858136177062988, "optim/adamw_lr": 0.0004457440853118896, "perf/tokens_per_sec": 2026575.3708743465, "perf/iters_per_sec": 0.9663464407321675, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348255634307861, "data/tokens_consumed": 33954988032, "data/tokens_consumed_B": 33.954988032, "train/loss_slope": -1.898120169950038e-05} {"step": 16200, "timestamp": 1778343220.9363942, "grad/layer_0/attn": 0.003193936077877879, "grad/layer_0/mlp": 0.00327483844012022, "grad/layer_0/attn_mlp_ratio": 0.97529574015598, "grad/layer_4/attn": 0.002146498067304492, "grad/layer_4/mlp": 0.002555110724642873, "grad/layer_4/attn_mlp_ratio": 0.8400802214144694, "grad/layer_8/attn": 0.0037434410769492388, "grad/layer_8/mlp": 0.003324666526168585, "grad/layer_8/attn_mlp_ratio": 1.12595983232859, "grad/layer_12/attn": 0.007284766994416714, "grad/layer_12/mlp": 0.007305072620511055, "grad/layer_12/attn_mlp_ratio": 0.9972203252628373, "grad/layer_16/attn": 0.0031043547205626965, "grad/layer_16/mlp": 0.004094322212040424, "grad/layer_16/attn_mlp_ratio": 0.7582096581486832, "grad/layer_20/attn": 0.002529943361878395, "grad/layer_20/mlp": 0.005239325109869242, "grad/layer_20/attn_mlp_ratio": 0.482875801851883, "grad/layer_24/attn": 0.004274966660887003, "grad/layer_24/mlp": 0.006935387849807739, "grad/layer_24/attn_mlp_ratio": 0.616399066905192, "grad/layer_27/attn": 0.00552619993686676, "grad/layer_27/mlp": 0.006253359839320183, "grad/layer_27/attn_mlp_ratio": 0.8837169122664522} {"step": 16200, "timestamp": 1778343221.5220132, "eos/sharpness": 25.487971305847164, "eos/L0_probe": 2.284555196762085, "eos/L_plus": 2.3998568058013916, "eos/L_minus": 2.42413330078125, "eos/grad_norm": 0.09495409578084946, "eos/embed_grad_frac": 0.23662957549095154, "eos/time_s": 0.5828471183776855} {"step": 16200, "timestamp": 1778343221.5416284, "train/loss": 2.314331126213074, "train/z_loss": 0.0013737360946834087, "train/perplexity": 10.118152887608897, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.014698988795280457, "optim/adamw_lr": 0.00044096966385841365, "perf/tokens_per_sec": 1913484.3228660438, "perf/iters_per_sec": 0.9124204267816752, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0959859848022462, "data/tokens_consumed": 33975959552, "data/tokens_consumed_B": 33.975959552, "train/loss_slope": -2.0254922363803136e-05} {"step": 16200, "timestamp": 1778343222.9089947, "geo/rankme_last": 429.5175476074219, "geo/layer_0/stable_rank_q_proj": 20.540863037109375, "geo/layer_0/stable_rank_k_proj": 16.69572639465332, "geo/layer_0/stable_rank_o_proj": 43.69859313964844, "geo/layer_0/stable_rank_gate_proj": 123.44352722167969, "geo/layer_0/stable_rank_down_proj": 57.97614669799805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06113826856017113, "geo/layer_0/attn_entropy_mean": 6.228898048400879, "geo/layer_0/attn_entropy_std": 0.4669639766216278, "geo/layer_7/stable_rank_q_proj": 41.69070816040039, "geo/layer_7/stable_rank_k_proj": 38.71121597290039, "geo/layer_7/stable_rank_o_proj": 86.71442413330078, "geo/layer_7/stable_rank_gate_proj": 76.99779510498047, "geo/layer_7/stable_rank_down_proj": 144.23992919921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3999446928501129, "geo/layer_7/attn_entropy_mean": 4.756991863250732, "geo/layer_7/attn_entropy_std": 0.7581555247306824, "geo/layer_14/stable_rank_q_proj": 51.694766998291016, "geo/layer_14/stable_rank_k_proj": 44.667903900146484, "geo/layer_14/stable_rank_o_proj": 42.11943817138672, "geo/layer_14/stable_rank_gate_proj": 71.64237213134766, "geo/layer_14/stable_rank_down_proj": 126.62109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37170785665512085, "geo/layer_14/attn_entropy_mean": 5.488603591918945, "geo/layer_14/attn_entropy_std": 0.4530611038208008, "geo/layer_21/stable_rank_q_proj": 38.3383674621582, "geo/layer_21/stable_rank_k_proj": 28.53888511657715, "geo/layer_21/stable_rank_o_proj": 64.15324401855469, "geo/layer_21/stable_rank_gate_proj": 59.330875396728516, "geo/layer_21/stable_rank_down_proj": 48.66123580932617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13715393841266632, "geo/layer_21/attn_entropy_mean": 5.8416290283203125, "geo/layer_21/attn_entropy_std": 0.3326529562473297, "geo/layer_27/stable_rank_q_proj": 45.617942810058594, "geo/layer_27/stable_rank_k_proj": 30.366525650024414, "geo/layer_27/stable_rank_o_proj": 106.20198822021484, "geo/layer_27/stable_rank_gate_proj": 68.52972412109375, "geo/layer_27/stable_rank_down_proj": 129.46310424804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09774509817361832, "geo/layer_27/attn_entropy_mean": 4.2962236404418945, "geo/layer_27/attn_entropy_std": 0.7198547124862671, "attnres/final_alpha/block_0": 0.2647709250450134, "attnres/block_norm/0": 1.7855768203735352, "attnres/final_alpha/block_1": 0.003715797094628215, "attnres/block_norm/1": 50593.30859375, "attnres/final_alpha/block_2": 0.007893179543316364, "attnres/block_norm/2": 30206.1328125, "attnres/final_alpha/block_3": 0.010051347315311432, "attnres/block_norm/3": 74782.1875, "attnres/final_alpha/block_4": 0.01145925559103489, "attnres/block_norm/4": 17796.47265625, "attnres/final_alpha/block_5": 0.6033412218093872, "attnres/block_norm/5": 7281.6015625, "attnres/final_alpha/block_6": 0.09876823425292969, "attnres/block_norm/6": 49812.5, "geo/tier1_time_s": 1.3633067607879639, "geo/step": 16200.0, "geo/rankme_slope": 0.00010324489170668267} {"step": 16210, "timestamp": 1778343233.272994, "train/loss": 2.3195736169815064, "train/z_loss": 0.001379396626725793, "train/perplexity": 10.171336496197457, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.014538299143314361, "optim/adamw_lr": 0.0004361489742994308, "perf/tokens_per_sec": 1788234.0974960697, "perf/iters_per_sec": 0.8526964652519559, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1727502584457397, "data/tokens_consumed": 33996931072, "data/tokens_consumed_B": 33.996931072, "train/loss_slope": -2.1757341811794992e-05} {"step": 16220, "timestamp": 1778343243.626504, "train/loss": 2.250540852546692, "train/z_loss": 0.0013640568708069623, "train/perplexity": 9.492868690381423, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.014376118779182434, "optim/adamw_lr": 0.000431283563375473, "perf/tokens_per_sec": 2026500.2474022838, "perf/iters_per_sec": 0.9663106190692348, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348639249801637, "data/tokens_consumed": 34017902592, "data/tokens_consumed_B": 34.017902592, "train/loss_slope": -2.4929329700166665e-05} {"step": 16230, "timestamp": 1778343253.9806788, "train/loss": 2.3107700109481812, "train/z_loss": 0.0013679015100933611, "train/perplexity": 10.082185059709424, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.014212503135204316, "optim/adamw_lr": 0.0004263750940561294, "perf/tokens_per_sec": 2026287.8868077246, "perf/iters_per_sec": 0.9662093576468108, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349723815917968, "data/tokens_consumed": 34038874112, "data/tokens_consumed_B": 34.038874112, "train/loss_slope": -2.5766126779761697e-05} {"step": 16240, "timestamp": 1778343264.3362646, "train/loss": 2.316707944869995, "train/z_loss": 0.0013821814907714724, "train/perplexity": 10.142230504895666, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.014047504663467408, "optim/adamw_lr": 0.0004214251399040222, "perf/tokens_per_sec": 2026201.8628577606, "perf/iters_per_sec": 0.9661683382309726, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350163221359252, "data/tokens_consumed": 34059845632, "data/tokens_consumed_B": 34.059845632, "train/loss_slope": -2.8309036238287205e-05} {"step": 16250, "timestamp": 1778343274.6938777, "grad/layer_0/attn": 0.0025040286127477884, "grad/layer_0/mlp": 0.0029611431527882814, "grad/layer_0/attn_mlp_ratio": 0.8456290017005887, "grad/layer_4/attn": 0.0021796678192913532, "grad/layer_4/mlp": 0.002443179953843355, "grad/layer_4/attn_mlp_ratio": 0.8921437516905592, "grad/layer_8/attn": 0.0028346918988972902, "grad/layer_8/mlp": 0.003132044803351164, "grad/layer_8/attn_mlp_ratio": 0.9050610659713985, "grad/layer_12/attn": 0.005550919566303492, "grad/layer_12/mlp": 0.006680052727460861, "grad/layer_12/attn_mlp_ratio": 0.8309694114220708, "grad/layer_16/attn": 0.0031347223557531834, "grad/layer_16/mlp": 0.004016288556158543, "grad/layer_16/attn_mlp_ratio": 0.7805022557196997, "grad/layer_20/attn": 0.002615493256598711, "grad/layer_20/mlp": 0.0046470132656395435, "grad/layer_20/attn_mlp_ratio": 0.5628331684900921, "grad/layer_24/attn": 0.003737441264092922, "grad/layer_24/mlp": 0.006668995600193739, "grad/layer_24/attn_mlp_ratio": 0.560420403927438, "grad/layer_27/attn": 0.008037686347961426, "grad/layer_27/mlp": 0.005967079661786556, "grad/layer_27/attn_mlp_ratio": 1.3470050123068786} {"step": 16250, "timestamp": 1778343274.7079055, "train/loss": 2.3184911012649536, "train/z_loss": 0.0013845350360497833, "train/perplexity": 10.160331822022993, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.01388117641210556, "optim/adamw_lr": 0.00041643529236316676, "perf/tokens_per_sec": 2022839.6376358904, "perf/iters_per_sec": 0.9645651043109371, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0367366552352906, "data/tokens_consumed": 34080817152, "data/tokens_consumed_B": 34.080817152, "train/loss_slope": -2.5664949159596436e-05} {"step": 16260, "timestamp": 1778343285.0676425, "train/loss": 2.3216652870178223, "train/z_loss": 0.0013733498170040547, "train/perplexity": 10.192633841720776, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.013713575601577759, "optim/adamw_lr": 0.00041140726804733273, "perf/tokens_per_sec": 2025182.361117094, "perf/iters_per_sec": 0.96568220191817, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355373620986938, "data/tokens_consumed": 34101788672, "data/tokens_consumed_B": 34.101788672, "train/loss_slope": -2.409118125290616e-05} {"step": 16270, "timestamp": 1778343295.4236524, "train/loss": 2.3095550298690797, "train/z_loss": 0.0013675746624357999, "train/perplexity": 10.069942834168051, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.013544755876064301, "optim/adamw_lr": 0.00040634267628192897, "perf/tokens_per_sec": 2026239.0627688407, "perf/iters_per_sec": 0.9661860765308574, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034997320175171, "data/tokens_consumed": 34122760192, "data/tokens_consumed_B": 34.122760192, "train/loss_slope": -2.4276131235941567e-05} {"step": 16275, "timestamp": 1778343301.1843355, "eos/sharpness": 8.668375015258787, "eos/L0_probe": 2.2806527614593506, "eos/L_plus": 2.3331336975097656, "eos/L_minus": 2.3148555755615234, "eos/grad_norm": 0.08572234958410263, "eos/embed_grad_frac": 0.27540433406829834, "eos/time_s": 0.5815541744232178} {"step": 16275, "timestamp": 1778343302.5656393, "geo/rankme_last": 430.63336181640625, "geo/layer_0/stable_rank_q_proj": 20.53485870361328, "geo/layer_0/stable_rank_k_proj": 16.676254272460938, "geo/layer_0/stable_rank_o_proj": 43.68230056762695, "geo/layer_0/stable_rank_gate_proj": 123.28890991210938, "geo/layer_0/stable_rank_down_proj": 57.92745590209961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0676506906747818, "geo/layer_0/attn_entropy_mean": 6.224886894226074, "geo/layer_0/attn_entropy_std": 0.4695870876312256, "geo/layer_7/stable_rank_q_proj": 41.68590545654297, "geo/layer_7/stable_rank_k_proj": 38.76271057128906, "geo/layer_7/stable_rank_o_proj": 86.7481460571289, "geo/layer_7/stable_rank_gate_proj": 76.94384002685547, "geo/layer_7/stable_rank_down_proj": 144.40565490722656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39606571197509766, "geo/layer_7/attn_entropy_mean": 4.741695880889893, "geo/layer_7/attn_entropy_std": 0.7430000901222229, "geo/layer_14/stable_rank_q_proj": 51.641845703125, "geo/layer_14/stable_rank_k_proj": 44.66899108886719, "geo/layer_14/stable_rank_o_proj": 42.161712646484375, "geo/layer_14/stable_rank_gate_proj": 71.67009735107422, "geo/layer_14/stable_rank_down_proj": 126.56365203857422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3740891218185425, "geo/layer_14/attn_entropy_mean": 5.497526168823242, "geo/layer_14/attn_entropy_std": 0.4615325629711151, "geo/layer_21/stable_rank_q_proj": 38.3702278137207, "geo/layer_21/stable_rank_k_proj": 28.507755279541016, "geo/layer_21/stable_rank_o_proj": 64.14674377441406, "geo/layer_21/stable_rank_gate_proj": 59.3151969909668, "geo/layer_21/stable_rank_down_proj": 48.692352294921875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13701078295707703, "geo/layer_21/attn_entropy_mean": 5.84161376953125, "geo/layer_21/attn_entropy_std": 0.33702778816223145, "geo/layer_27/stable_rank_q_proj": 45.554622650146484, "geo/layer_27/stable_rank_k_proj": 30.38892364501953, "geo/layer_27/stable_rank_o_proj": 106.34857940673828, "geo/layer_27/stable_rank_gate_proj": 68.50263214111328, "geo/layer_27/stable_rank_down_proj": 129.35609436035156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09876054525375366, "geo/layer_27/attn_entropy_mean": 4.272492408752441, "geo/layer_27/attn_entropy_std": 0.6926149725914001, "attnres/final_alpha/block_0": 0.2641528844833374, "attnres/block_norm/0": 1.7856849431991577, "attnres/final_alpha/block_1": 0.003740315791219473, "attnres/block_norm/1": 50712.875, "attnres/final_alpha/block_2": 0.007927973754703999, "attnres/block_norm/2": 30354.8515625, "attnres/final_alpha/block_3": 0.010113725438714027, "attnres/block_norm/3": 74580.328125, "attnres/final_alpha/block_4": 0.011541549116373062, "attnres/block_norm/4": 17786.81640625, "attnres/final_alpha/block_5": 0.603145956993103, "attnres/block_norm/5": 7269.7490234375, "attnres/final_alpha/block_6": 0.09937752038240433, "attnres/block_norm/6": 49841.10546875, "geo/tier1_time_s": 1.3616406917572021, "geo/step": 16275.0, "geo/rankme_slope": 0.00016501362263655463} {"step": 16280, "timestamp": 1778343307.7436764, "train/loss": 2.2753527879714968, "train/z_loss": 0.0013811783166602254, "train/perplexity": 9.731351510811658, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.013374771177768708, "optim/adamw_lr": 0.0004012431353330612, "perf/tokens_per_sec": 1702894.9124593304, "perf/iters_per_sec": 0.8120035707756664, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2315216779708862, "data/tokens_consumed": 34143731712, "data/tokens_consumed_B": 34.143731712, "train/loss_slope": -2.7686688079513484e-05} {"step": 16290, "timestamp": 1778343318.1089585, "train/loss": 2.2962101459503175, "train/z_loss": 0.0013805285911075772, "train/perplexity": 9.936453298203181, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.013203680217266082, "optim/adamw_lr": 0.00039611040651798244, "perf/tokens_per_sec": 2024173.713277491, "perf/iters_per_sec": 0.965201241148706, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0360533714294433, "data/tokens_consumed": 34164703232, "data/tokens_consumed_B": 34.164703232, "train/loss_slope": -3.07266856684829e-05} {"step": 16300, "timestamp": 1778343328.4508116, "grad/layer_0/attn": 0.0026223857421427965, "grad/layer_0/mlp": 0.002884529298171401, "grad/layer_0/attn_mlp_ratio": 0.9091208235926505, "grad/layer_4/attn": 0.0018287207931280136, "grad/layer_4/mlp": 0.002558088395744562, "grad/layer_4/attn_mlp_ratio": 0.7148778457704383, "grad/layer_8/attn": 0.006104591768234968, "grad/layer_8/mlp": 0.0033561703749001026, "grad/layer_8/attn_mlp_ratio": 1.8189158786448931, "grad/layer_12/attn": 0.006327013950794935, "grad/layer_12/mlp": 0.007257991470396519, "grad/layer_12/attn_mlp_ratio": 0.8717306832652161, "grad/layer_16/attn": 0.007463687565177679, "grad/layer_16/mlp": 0.00416058162227273, "grad/layer_16/attn_mlp_ratio": 1.7939048102871102, "grad/layer_20/attn": 0.0029002532828599215, "grad/layer_20/mlp": 0.0050172945484519005, "grad/layer_20/attn_mlp_ratio": 0.5780512180513063, "grad/layer_24/attn": 0.006189756095409393, "grad/layer_24/mlp": 0.0067148273810744286, "grad/layer_24/attn_mlp_ratio": 0.921804188246841, "grad/layer_27/attn": 0.006012273486703634, "grad/layer_27/mlp": 0.006493855267763138, "grad/layer_27/attn_mlp_ratio": 0.9258403746639975} {"step": 16300, "timestamp": 1778343328.464939, "train/loss": 2.306516003608704, "train/z_loss": 0.0013829529052600265, "train/perplexity": 10.03938646777188, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.013031537532806397, "optim/adamw_lr": 0.00039094612598419186, "perf/tokens_per_sec": 2026161.490597798, "perf/iters_per_sec": 0.9661490872372618, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350369453430175, "data/tokens_consumed": 34185674752, "data/tokens_consumed_B": 34.185674752, "train/loss_slope": -3.2296551078638415e-05} {"step": 16310, "timestamp": 1778343338.8229527, "train/loss": 2.2695908546447754, "train/z_loss": 0.0013718822621740401, "train/perplexity": 9.675441342243394, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.01285839855670929, "optim/adamw_lr": 0.0003857519567012787, "perf/tokens_per_sec": 2025635.5847368452, "perf/iters_per_sec": 0.9658983157810427, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035305666923523, "data/tokens_consumed": 34206646272, "data/tokens_consumed_B": 34.206646272, "train/loss_slope": -3.348008601328575e-05} {"step": 16320, "timestamp": 1778343349.1763017, "train/loss": 2.2580726861953737, "train/z_loss": 0.0013860545703209936, "train/perplexity": 9.5646373336703, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.012684322595596313, "optim/adamw_lr": 0.00038052967786788937, "perf/tokens_per_sec": 2026826.0873280445, "perf/iters_per_sec": 0.9664659916534636, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346975564956664, "data/tokens_consumed": 34227617792, "data/tokens_consumed_B": 34.227617792, "train/loss_slope": -3.702226763833147e-05} {"step": 16330, "timestamp": 1778343359.5303986, "train/loss": 2.3039160490036013, "train/z_loss": 0.0013858356745913626, "train/perplexity": 10.013318421245884, "train/grad_norm": 0.09375, "optim/muon_lr": 0.012509364485740661, "optim/adamw_lr": 0.0003752809345722198, "perf/tokens_per_sec": 2026732.3590915173, "perf/iters_per_sec": 0.9664212985475146, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347454071044921, "data/tokens_consumed": 34248589312, "data/tokens_consumed_B": 34.248589312, "train/loss_slope": -3.778137928939526e-05} {"step": 16340, "timestamp": 1778343369.8832488, "train/loss": 2.3570650577545167, "train/z_loss": 0.0013771882513538003, "train/perplexity": 10.559913191627603, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.012333583682775498, "optim/adamw_lr": 0.00037000751048326487, "perf/tokens_per_sec": 2026582.7948138185, "perf/iters_per_sec": 0.966349980742368, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348217725753783, "data/tokens_consumed": 34269560832, "data/tokens_consumed_B": 34.269560832, "train/loss_slope": -3.549486673024902e-05} {"step": 16350, "timestamp": 1778343380.2302194, "grad/layer_0/attn": 0.0033057439140975475, "grad/layer_0/mlp": 0.003817107295617461, "grad/layer_0/attn_mlp_ratio": 0.8660337715132076, "grad/layer_4/attn": 0.002217091154307127, "grad/layer_4/mlp": 0.00274440529756248, "grad/layer_4/attn_mlp_ratio": 0.8078584731965249, "grad/layer_8/attn": 0.0036939322017133236, "grad/layer_8/mlp": 0.003404879244044423, "grad/layer_8/attn_mlp_ratio": 1.084893715301393, "grad/layer_12/attn": 0.012708151713013649, "grad/layer_12/mlp": 0.007827653549611568, "grad/layer_12/attn_mlp_ratio": 1.6234943805471345, "grad/layer_16/attn": 0.003561947960406542, "grad/layer_16/mlp": 0.004498350899666548, "grad/layer_16/attn_mlp_ratio": 0.7918341544869623, "grad/layer_20/attn": 0.0026877140626311302, "grad/layer_20/mlp": 0.005478624254465103, "grad/layer_20/attn_mlp_ratio": 0.4905819214344622, "grad/layer_24/attn": 0.008242284879088402, "grad/layer_24/mlp": 0.008944755420088768, "grad/layer_24/attn_mlp_ratio": 0.9214656410202932, "grad/layer_27/attn": 0.006691859569400549, "grad/layer_27/mlp": 0.008965929970145226, "grad/layer_27/attn_mlp_ratio": 0.7463653538502513} {"step": 16350, "timestamp": 1778343380.8151214, "eos/sharpness": 41.85845851898193, "eos/L0_probe": 2.2741079330444336, "eos/L_plus": 2.5004169940948486, "eos/L_minus": 2.466383457183838, "eos/grad_norm": 0.12604689598083496, "eos/embed_grad_frac": 0.13788391649723053, "eos/time_s": 0.5821046829223633} {"step": 16350, "timestamp": 1778343380.8342369, "train/loss": 2.265703296661377, "train/z_loss": 0.001383058854844421, "train/perplexity": 9.637900521349296, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.01215703696012497, "optim/adamw_lr": 0.00036471110880374904, "perf/tokens_per_sec": 1916165.262802467, "perf/iters_per_sec": 0.9136987985622725, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0944525718688964, "data/tokens_consumed": 34290532352, "data/tokens_consumed_B": 34.290532352, "train/loss_slope": -3.473519446289716e-05} {"step": 16350, "timestamp": 1778343382.1974573, "geo/rankme_last": 430.08709716796875, "geo/layer_0/stable_rank_q_proj": 20.524988174438477, "geo/layer_0/stable_rank_k_proj": 16.652616500854492, "geo/layer_0/stable_rank_o_proj": 43.68580627441406, "geo/layer_0/stable_rank_gate_proj": 123.25077056884766, "geo/layer_0/stable_rank_down_proj": 57.92485427856445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06756591796875, "geo/layer_0/attn_entropy_mean": 6.224094867706299, "geo/layer_0/attn_entropy_std": 0.4693264961242676, "geo/layer_7/stable_rank_q_proj": 41.66668701171875, "geo/layer_7/stable_rank_k_proj": 38.81666564941406, "geo/layer_7/stable_rank_o_proj": 86.84713745117188, "geo/layer_7/stable_rank_gate_proj": 76.9148941040039, "geo/layer_7/stable_rank_down_proj": 144.5303955078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3998810648918152, "geo/layer_7/attn_entropy_mean": 4.754345893859863, "geo/layer_7/attn_entropy_std": 0.7385231852531433, "geo/layer_14/stable_rank_q_proj": 51.57544708251953, "geo/layer_14/stable_rank_k_proj": 44.66239547729492, "geo/layer_14/stable_rank_o_proj": 42.1569709777832, "geo/layer_14/stable_rank_gate_proj": 71.68861389160156, "geo/layer_14/stable_rank_down_proj": 126.52436065673828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36694079637527466, "geo/layer_14/attn_entropy_mean": 5.505649566650391, "geo/layer_14/attn_entropy_std": 0.47164157032966614, "geo/layer_21/stable_rank_q_proj": 38.279823303222656, "geo/layer_21/stable_rank_k_proj": 28.472105026245117, "geo/layer_21/stable_rank_o_proj": 64.15331268310547, "geo/layer_21/stable_rank_gate_proj": 59.272972106933594, "geo/layer_21/stable_rank_down_proj": 48.708770751953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13644595444202423, "geo/layer_21/attn_entropy_mean": 5.861174583435059, "geo/layer_21/attn_entropy_std": 0.32921791076660156, "geo/layer_27/stable_rank_q_proj": 45.509735107421875, "geo/layer_27/stable_rank_k_proj": 30.397897720336914, "geo/layer_27/stable_rank_o_proj": 106.41644287109375, "geo/layer_27/stable_rank_gate_proj": 68.45159912109375, "geo/layer_27/stable_rank_down_proj": 129.21360778808594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08905176818370819, "geo/layer_27/attn_entropy_mean": 4.284061908721924, "geo/layer_27/attn_entropy_std": 0.7046843767166138, "attnres/final_alpha/block_0": 0.26267048716545105, "attnres/block_norm/0": 1.785696268081665, "attnres/final_alpha/block_1": 0.0036882509011775255, "attnres/block_norm/1": 50467.82421875, "attnres/final_alpha/block_2": 0.007790686562657356, "attnres/block_norm/2": 30194.09375, "attnres/final_alpha/block_3": 0.009857067838311195, "attnres/block_norm/3": 75036.09375, "attnres/final_alpha/block_4": 0.01126781478524208, "attnres/block_norm/4": 17778.056640625, "attnres/final_alpha/block_5": 0.606918454170227, "attnres/block_norm/5": 7249.0078125, "attnres/final_alpha/block_6": 0.09780722856521606, "attnres/block_norm/6": 49760.6796875, "geo/tier1_time_s": 1.3599460124969482, "geo/step": 16350.0, "geo/rankme_slope": 0.00015031833045718286} {"step": 16360, "timestamp": 1778343392.5492094, "train/loss": 2.288975739479065, "train/z_loss": 0.0013857606332749127, "train/perplexity": 9.86482835053761, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.011979780942201614, "optim/adamw_lr": 0.0003593934282660484, "perf/tokens_per_sec": 1790634.4988446499, "perf/iters_per_sec": 0.8538410658095598, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.171178150177002, "data/tokens_consumed": 34311503872, "data/tokens_consumed_B": 34.311503872, "train/loss_slope": -3.555005736226542e-05} {"step": 16370, "timestamp": 1778343402.9061918, "train/loss": 2.3301936388015747, "train/z_loss": 0.0013757633860222995, "train/perplexity": 10.279931934027772, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.011801876425743103, "optim/adamw_lr": 0.00035405629277229306, "perf/tokens_per_sec": 2025785.0092315492, "perf/iters_per_sec": 0.9659695669324633, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352293014526368, "data/tokens_consumed": 34332475392, "data/tokens_consumed_B": 34.332475392, "train/loss_slope": -3.521057577273378e-05} {"step": 16380, "timestamp": 1778343413.263821, "train/loss": 2.31224844455719, "train/z_loss": 0.0013730299891903997, "train/perplexity": 10.097101925034398, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.011623380482196808, "optim/adamw_lr": 0.00034870141446590423, "perf/tokens_per_sec": 2025744.466969418, "perf/iters_per_sec": 0.9659502348754015, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352500200271606, "data/tokens_consumed": 34353446912, "data/tokens_consumed_B": 34.353446912, "train/loss_slope": -3.437225128343212e-05} {"step": 16390, "timestamp": 1778343423.6165023, "train/loss": 2.2675791025161742, "train/z_loss": 0.001380430208519101, "train/perplexity": 9.655996318370136, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.011444350332021713, "optim/adamw_lr": 0.0003433305099606514, "perf/tokens_per_sec": 2026571.5888883583, "perf/iters_per_sec": 0.9663446373407165, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348274946212768, "data/tokens_consumed": 34374418432, "data/tokens_consumed_B": 34.374418432, "train/loss_slope": -3.658010370910896e-05} {"step": 16400, "timestamp": 1778343433.9641654, "grad/layer_0/attn": 0.002685749903321266, "grad/layer_0/mlp": 0.003048306331038475, "grad/layer_0/attn_mlp_ratio": 0.8810629653155652, "grad/layer_4/attn": 0.0016968630952760577, "grad/layer_4/mlp": 0.002502874471247196, "grad/layer_4/attn_mlp_ratio": 0.6779656938343904, "grad/layer_8/attn": 0.0033561228774487972, "grad/layer_8/mlp": 0.0031418364960700274, "grad/layer_8/attn_mlp_ratio": 1.0682041458320304, "grad/layer_12/attn": 0.006086628418415785, "grad/layer_12/mlp": 0.006646355148404837, "grad/layer_12/attn_mlp_ratio": 0.9157843947442639, "grad/layer_16/attn": 0.003144538030028343, "grad/layer_16/mlp": 0.004135059658437967, "grad/layer_16/attn_mlp_ratio": 0.760457699217436, "grad/layer_20/attn": 0.002328656380996108, "grad/layer_20/mlp": 0.004636792931705713, "grad/layer_20/attn_mlp_ratio": 0.5022127071605518, "grad/layer_24/attn": 0.004738807678222656, "grad/layer_24/mlp": 0.007489528972655535, "grad/layer_24/attn_mlp_ratio": 0.6327243852386082, "grad/layer_27/attn": 0.003839978016912937, "grad/layer_27/mlp": 0.0063255405984818935, "grad/layer_27/attn_mlp_ratio": 0.6070592538965907} {"step": 16400, "timestamp": 1778343433.9787471, "train/loss": 2.300977349281311, "train/z_loss": 0.0013965170248411596, "train/perplexity": 9.983935480148398, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.01126484751701355, "optim/adamw_lr": 0.00033794542551040646, "perf/tokens_per_sec": 2024799.158830532, "perf/iters_per_sec": 0.9654994768288289, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357333421707153, "data/tokens_consumed": 34395389952, "data/tokens_consumed_B": 34.395389952, "train/loss_slope": -3.353219231148575e-05} {"step": 16410, "timestamp": 1778343444.336083, "train/loss": 2.2671663045883177, "train/z_loss": 0.0013918487005867065, "train/perplexity": 9.652011165686497, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.011084929332137108, "optim/adamw_lr": 0.0003325478799641132, "perf/tokens_per_sec": 2026003.050859302, "perf/iters_per_sec": 0.9660735372826109, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351178884506225, "data/tokens_consumed": 34416361472, "data/tokens_consumed_B": 34.416361472, "train/loss_slope": -3.7512294272563846e-05} {"step": 16420, "timestamp": 1778343454.6895254, "train/loss": 2.2629277229309084, "train/z_loss": 0.0013852943899109959, "train/perplexity": 9.611186907797094, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.010904653891921044, "optim/adamw_lr": 0.0003271396167576313, "perf/tokens_per_sec": 2026510.8922790221, "perf/iters_per_sec": 0.9663156949420081, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.03485848903656, "data/tokens_consumed": 34437332992, "data/tokens_consumed_B": 34.437332992, "train/loss_slope": -3.721083443049657e-05} {"step": 16425, "timestamp": 1778343460.437188, "eos/sharpness": 35.61687469482421, "eos/L0_probe": 2.2686703205108643, "eos/L_plus": 2.4772133827209473, "eos/L_minus": 2.4162960052490234, "eos/grad_norm": 0.1050339788198471, "eos/embed_grad_frac": 0.1660187840461731, "eos/time_s": 0.5821030139923096} {"step": 16425, "timestamp": 1778343461.8146722, "geo/rankme_last": 430.5093688964844, "geo/layer_0/stable_rank_q_proj": 20.5124568939209, "geo/layer_0/stable_rank_k_proj": 16.637155532836914, "geo/layer_0/stable_rank_o_proj": 43.65042495727539, "geo/layer_0/stable_rank_gate_proj": 123.1515884399414, "geo/layer_0/stable_rank_down_proj": 57.90700912475586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0646902471780777, "geo/layer_0/attn_entropy_mean": 6.2244181632995605, "geo/layer_0/attn_entropy_std": 0.4678645431995392, "geo/layer_7/stable_rank_q_proj": 41.624385833740234, "geo/layer_7/stable_rank_k_proj": 38.796817779541016, "geo/layer_7/stable_rank_o_proj": 86.86643981933594, "geo/layer_7/stable_rank_gate_proj": 76.9086685180664, "geo/layer_7/stable_rank_down_proj": 144.31185913085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3930988609790802, "geo/layer_7/attn_entropy_mean": 4.751300811767578, "geo/layer_7/attn_entropy_std": 0.734386146068573, "geo/layer_14/stable_rank_q_proj": 51.57742691040039, "geo/layer_14/stable_rank_k_proj": 44.586204528808594, "geo/layer_14/stable_rank_o_proj": 42.15659713745117, "geo/layer_14/stable_rank_gate_proj": 71.64498901367188, "geo/layer_14/stable_rank_down_proj": 126.5427474975586, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37524935603141785, "geo/layer_14/attn_entropy_mean": 5.475714206695557, "geo/layer_14/attn_entropy_std": 0.47521355748176575, "geo/layer_21/stable_rank_q_proj": 38.26051330566406, "geo/layer_21/stable_rank_k_proj": 28.472637176513672, "geo/layer_21/stable_rank_o_proj": 64.13272857666016, "geo/layer_21/stable_rank_gate_proj": 59.2403678894043, "geo/layer_21/stable_rank_down_proj": 48.72068786621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13672736287117004, "geo/layer_21/attn_entropy_mean": 5.85096549987793, "geo/layer_21/attn_entropy_std": 0.33958980441093445, "geo/layer_27/stable_rank_q_proj": 45.50517272949219, "geo/layer_27/stable_rank_k_proj": 30.417980194091797, "geo/layer_27/stable_rank_o_proj": 106.46635437011719, "geo/layer_27/stable_rank_gate_proj": 68.38733673095703, "geo/layer_27/stable_rank_down_proj": 129.20204162597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09522180259227753, "geo/layer_27/attn_entropy_mean": 4.289555549621582, "geo/layer_27/attn_entropy_std": 0.7073706388473511, "attnres/final_alpha/block_0": 0.26426932215690613, "attnres/block_norm/0": 1.7857067584991455, "attnres/final_alpha/block_1": 0.0036985690239816904, "attnres/block_norm/1": 50645.6796875, "attnres/final_alpha/block_2": 0.007761305198073387, "attnres/block_norm/2": 30285.48828125, "attnres/final_alpha/block_3": 0.009953472763299942, "attnres/block_norm/3": 74625.65625, "attnres/final_alpha/block_4": 0.011457698419690132, "attnres/block_norm/4": 17780.7578125, "attnres/final_alpha/block_5": 0.6035996675491333, "attnres/block_norm/5": 7261.1865234375, "attnres/final_alpha/block_6": 0.09925991296768188, "attnres/block_norm/6": 49761.296875, "geo/tier1_time_s": 1.3582510948181152, "geo/step": 16425.0, "geo/rankme_slope": 0.0001537735406662665} {"step": 16430, "timestamp": 1778343466.9959214, "train/loss": 2.2665639877319337, "train/z_loss": 0.0013828842318616807, "train/perplexity": 9.646199347117246, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.010724082738161087, "optim/adamw_lr": 0.00032172248214483256, "perf/tokens_per_sec": 1704760.210008887, "perf/iters_per_sec": 0.8128930139584003, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.23017418384552, "data/tokens_consumed": 34458304512, "data/tokens_consumed_B": 34.458304512, "train/loss_slope": -3.9418553590703e-05} {"step": 16440, "timestamp": 1778343477.3586695, "train/loss": 2.310678577423096, "train/z_loss": 0.0013842507149092853, "train/perplexity": 10.081263252131551, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.010543273873627187, "optim/adamw_lr": 0.00031629821620881557, "perf/tokens_per_sec": 2025089.810541913, "perf/iters_per_sec": 0.9656380703649106, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355846881866455, "data/tokens_consumed": 34479276032, "data/tokens_consumed_B": 34.479276032, "train/loss_slope": -3.692125610285179e-05} {"step": 16450, "timestamp": 1778343487.7008307, "grad/layer_0/attn": 0.0028826557099819183, "grad/layer_0/mlp": 0.0030816467478871346, "grad/layer_0/attn_mlp_ratio": 0.9354270142791832, "grad/layer_4/attn": 0.001964573049917817, "grad/layer_4/mlp": 0.00255744275636971, "grad/layer_4/attn_mlp_ratio": 0.7681786691830638, "grad/layer_8/attn": 0.004606580827385187, "grad/layer_8/mlp": 0.0032052677124738693, "grad/layer_8/attn_mlp_ratio": 1.4371906177255638, "grad/layer_12/attn": 0.004631999880075455, "grad/layer_12/mlp": 0.006480381824076176, "grad/layer_12/attn_mlp_ratio": 0.7147726683926549, "grad/layer_16/attn": 0.003125646850094199, "grad/layer_16/mlp": 0.004083056468516588, "grad/layer_16/attn_mlp_ratio": 0.7655164207606792, "grad/layer_20/attn": 0.002562869107350707, "grad/layer_20/mlp": 0.0050384411588311195, "grad/layer_20/attn_mlp_ratio": 0.5086630915580573, "grad/layer_24/attn": 0.004195289686322212, "grad/layer_24/mlp": 0.006891618482768536, "grad/layer_24/attn_mlp_ratio": 0.6087524484903892, "grad/layer_27/attn": 0.004989630077034235, "grad/layer_27/mlp": 0.006139911245554686, "grad/layer_27/attn_mlp_ratio": 0.8126550688141031} {"step": 16450, "timestamp": 1778343487.7148771, "train/loss": 2.254947066307068, "train/z_loss": 0.0013844380271621048, "train/perplexity": 9.534788585217258, "train/grad_norm": 0.078125, "optim/muon_lr": 0.010362285524606704, "optim/adamw_lr": 0.00031086856573820114, "perf/tokens_per_sec": 2025984.2450775623, "perf/iters_per_sec": 0.9660645699870883, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351274967193604, "data/tokens_consumed": 34500247552, "data/tokens_consumed_B": 34.500247552, "train/loss_slope": -4.058741168363508e-05} {"step": 16460, "timestamp": 1778343498.0734923, "train/loss": 2.2853169679641723, "train/z_loss": 0.001388587022665888, "train/perplexity": 9.828801145415456, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.01018117941915989, "optim/adamw_lr": 0.00030543538257479666, "perf/tokens_per_sec": 2025434.971654342, "perf/iters_per_sec": 0.9658026560088835, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354082107543945, "data/tokens_consumed": 34521219072, "data/tokens_consumed_B": 34.521219072, "train/loss_slope": -4.147532985548766e-05} {"step": 16470, "timestamp": 1778343508.4483752, "train/loss": 2.2863542318344114, "train/z_loss": 0.0013931971043348312, "train/perplexity": 9.839001495043869, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.010000012675908465, "optim/adamw_lr": 0.00030000038027725394, "perf/tokens_per_sec": 2022689.4379953824, "perf/iters_per_sec": 0.9644934835411941, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368136405944823, "data/tokens_consumed": 34542190592, "data/tokens_consumed_B": 34.542190592, "train/loss_slope": -4.2770821819997856e-05} {"step": 16480, "timestamp": 1778343518.8034117, "train/loss": 2.2672143936157227, "train/z_loss": 0.0013796802377328277, "train/perplexity": 9.652475332676538, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.009818847123533488, "optim/adamw_lr": 0.0002945654137060046, "perf/tokens_per_sec": 2026244.1971172905, "perf/iters_per_sec": 0.9661885247789814, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349946975708009, "data/tokens_consumed": 34563162112, "data/tokens_consumed_B": 34.563162112, "train/loss_slope": -4.15846664841884e-05} {"step": 16490, "timestamp": 1778343529.175208, "train/loss": 2.2877882719039917, "train/z_loss": 0.0013746194075793029, "train/perplexity": 9.85312113908036, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.00963774099946022, "optim/adamw_lr": 0.00028913222998380657, "perf/tokens_per_sec": 2023330.9541445735, "perf/iters_per_sec": 0.9647993822787159, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0364849090576171, "data/tokens_consumed": 34584133632, "data/tokens_consumed_B": 34.584133632, "train/loss_slope": -4.272561795783765e-05} {"step": 16500, "timestamp": 1778343539.5306587, "grad/layer_0/attn": 0.002501517068594694, "grad/layer_0/mlp": 0.002923412946984172, "grad/layer_0/attn_mlp_ratio": 0.8556837601772659, "grad/layer_4/attn": 0.001931527629494667, "grad/layer_4/mlp": 0.0025200580712407827, "grad/layer_4/attn_mlp_ratio": 0.7664615251891804, "grad/layer_8/attn": 0.0033061057329177856, "grad/layer_8/mlp": 0.003288704203441739, "grad/layer_8/attn_mlp_ratio": 1.0052912721456397, "grad/layer_12/attn": 0.004833040293306112, "grad/layer_12/mlp": 0.006049225572496653, "grad/layer_12/attn_mlp_ratio": 0.7989518915255491, "grad/layer_16/attn": 0.0027778474614024162, "grad/layer_16/mlp": 0.004079550039023161, "grad/layer_16/attn_mlp_ratio": 0.680920044303602, "grad/layer_20/attn": 0.0038501524832099676, "grad/layer_20/mlp": 0.005024229176342487, "grad/layer_20/attn_mlp_ratio": 0.7663170351996321, "grad/layer_24/attn": 0.004256487358361483, "grad/layer_24/mlp": 0.006774715147912502, "grad/layer_24/attn_mlp_ratio": 0.6282902236626162, "grad/layer_27/attn": 0.004367482382804155, "grad/layer_27/mlp": 0.0062887175008654594, "grad/layer_27/attn_mlp_ratio": 0.6944949129538108} {"step": 16500, "timestamp": 1778343540.1146822, "eos/sharpness": 10.759687423706053, "eos/L0_probe": 2.266861915588379, "eos/L_plus": 2.3234081268310547, "eos/L_minus": 2.3179125785827637, "eos/grad_norm": 0.09392432868480682, "eos/embed_grad_frac": 0.22253847122192383, "eos/time_s": 0.5812942981719971} {"step": 16500, "timestamp": 1778343540.1325738, "train/loss": 2.287317728996277, "train/z_loss": 0.001383392489515245, "train/perplexity": 9.848485913431313, "train/grad_norm": 0.09375, "optim/muon_lr": 0.009456752613186837, "optim/adamw_lr": 0.00028370257839560505, "perf/tokens_per_sec": 1915421.4538539653, "perf/iters_per_sec": 0.9133441228170229, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.094877576828003, "data/tokens_consumed": 34605105152, "data/tokens_consumed_B": 34.605105152, "train/loss_slope": -4.281145770235746e-05} {"step": 16500, "timestamp": 1778343541.4932053, "geo/rankme_last": 431.0674743652344, "geo/layer_0/stable_rank_q_proj": 20.49913787841797, "geo/layer_0/stable_rank_k_proj": 16.62390899658203, "geo/layer_0/stable_rank_o_proj": 43.6541748046875, "geo/layer_0/stable_rank_gate_proj": 123.14279174804688, "geo/layer_0/stable_rank_down_proj": 57.958736419677734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06313364952802658, "geo/layer_0/attn_entropy_mean": 6.223470211029053, "geo/layer_0/attn_entropy_std": 0.46666374802589417, "geo/layer_7/stable_rank_q_proj": 41.59357833862305, "geo/layer_7/stable_rank_k_proj": 38.804107666015625, "geo/layer_7/stable_rank_o_proj": 86.82562255859375, "geo/layer_7/stable_rank_gate_proj": 76.9133071899414, "geo/layer_7/stable_rank_down_proj": 144.41119384765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3987247049808502, "geo/layer_7/attn_entropy_mean": 4.737910270690918, "geo/layer_7/attn_entropy_std": 0.741489052772522, "geo/layer_14/stable_rank_q_proj": 51.54664611816406, "geo/layer_14/stable_rank_k_proj": 44.545475006103516, "geo/layer_14/stable_rank_o_proj": 42.130615234375, "geo/layer_14/stable_rank_gate_proj": 71.6694107055664, "geo/layer_14/stable_rank_down_proj": 126.46302032470703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37848222255706787, "geo/layer_14/attn_entropy_mean": 5.520808696746826, "geo/layer_14/attn_entropy_std": 0.46536576747894287, "geo/layer_21/stable_rank_q_proj": 38.25006103515625, "geo/layer_21/stable_rank_k_proj": 28.473533630371094, "geo/layer_21/stable_rank_o_proj": 64.11349487304688, "geo/layer_21/stable_rank_gate_proj": 59.197418212890625, "geo/layer_21/stable_rank_down_proj": 48.716835021972656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1353728324174881, "geo/layer_21/attn_entropy_mean": 5.849176406860352, "geo/layer_21/attn_entropy_std": 0.33410191535949707, "geo/layer_27/stable_rank_q_proj": 45.50129318237305, "geo/layer_27/stable_rank_k_proj": 30.411357879638672, "geo/layer_27/stable_rank_o_proj": 106.48463439941406, "geo/layer_27/stable_rank_gate_proj": 68.3520736694336, "geo/layer_27/stable_rank_down_proj": 129.38014221191406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09209764748811722, "geo/layer_27/attn_entropy_mean": 4.300716400146484, "geo/layer_27/attn_entropy_std": 0.7098996639251709, "attnres/final_alpha/block_0": 0.26529574394226074, "attnres/block_norm/0": 1.785740613937378, "attnres/final_alpha/block_1": 0.003737790510058403, "attnres/block_norm/1": 50617.734375, "attnres/final_alpha/block_2": 0.0077140373177826405, "attnres/block_norm/2": 30323.59375, "attnres/final_alpha/block_3": 0.00982300192117691, "attnres/block_norm/3": 75114.234375, "attnres/final_alpha/block_4": 0.011434078216552734, "attnres/block_norm/4": 17807.02734375, "attnres/final_alpha/block_5": 0.6031283140182495, "attnres/block_norm/5": 7258.50048828125, "attnres/final_alpha/block_6": 0.0988670140504837, "attnres/block_norm/6": 49647.8203125, "geo/tier1_time_s": 1.3566527366638184, "geo/step": 16500.0, "geo/rankme_slope": 0.0001748783106992797} {"step": 16500, "timestamp": 1778343548.5642161, "geo/ww_alpha_mean": 7.669576712343147, "geo/ww_alpha_std": 4.837989211758694, "geo/ww_alpha_min": 1.3255103874856695, "geo/ww_alpha_max": 44.4701056976312, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.889694819885155, "geo/ww_alpha_by_type/k_proj": 4.62203950028921, "geo/ww_alpha_by_type/v_proj": 8.481264931985105, "geo/ww_alpha_by_type/o_proj": 9.707633689920314, "geo/ww_alpha_by_type/gate_proj": 7.642073668131041, "geo/ww_alpha_by_type/up_proj": 11.68095877305558, "geo/ww_alpha_by_type/down_proj": 7.761847682669571, "geo/twonn_id/layer_0": 0.7625459432601929, "geo/twonn_id/layer_7": 3.4317593574523926, "geo/twonn_id/layer_14": 5.228899955749512, "geo/twonn_id/layer_21": 8.514336585998535, "geo/twonn_id/layer_27": 5.987259387969971, "geo/tier2_time_s": 7.064152240753174} {"step": 16500, "timestamp": 1778343549.3948052, "eoc/jacobian_sigma/layer_0/attn": 1506.89697265625, "eoc/jacobian_sigma/layer_0/mlp": 10540.69140625, "eoc/jacobian_sigma/layer_0": 10540.69140625, "eoc/jacobian_sigma/layer_7/attn": 1.1334625482559204, "eoc/jacobian_sigma/layer_7/mlp": 1.870560646057129, "eoc/jacobian_sigma/layer_7": 1.870560646057129, "eoc/jacobian_sigma/layer_14/attn": 2.234578847885132, "eoc/jacobian_sigma/layer_14/mlp": 13.80944538116455, "eoc/jacobian_sigma/layer_14": 13.80944538116455, "eoc/jacobian_sigma/layer_21/attn": 1.0897098779678345, "eoc/jacobian_sigma/layer_21/mlp": 5.407545566558838, "eoc/jacobian_sigma/layer_21": 5.407545566558838, "eoc/jacobian_sigma/layer_27/attn": 3.8061227798461914, "eoc/jacobian_sigma/layer_27/mlp": 53.48180389404297, "eoc/jacobian_sigma/layer_27": 53.48180389404297, "eoc/layer0_sigma": 10540.69140625, "eoc/sigma_max": 53.48180389404297, "eoc/sigma_min": 1.870560646057129, "eoc/sigma_mean": 18.64233887195587, "eoc/time_s": 0.8221204280853271} {"step": 16510, "timestamp": 1778343559.7755156, "train/loss": 2.322266864776611, "train/z_loss": 0.0013818718958646058, "train/perplexity": 10.198767348249016, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.009275943785905839, "optim/adamw_lr": 0.0002782783135771751, "perf/tokens_per_sec": 1067845.134156763, "perf/iters_per_sec": 0.5091882391723457, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.9639102458953857, "data/tokens_consumed": 34626076672, "data/tokens_consumed_B": 34.626076672, "train/loss_slope": -4.2746683745065185e-05} {"step": 16520, "timestamp": 1778343570.1268692, "train/loss": 2.3154567003250124, "train/z_loss": 0.0013866186141967773, "train/perplexity": 10.129548030395629, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.009095372557640076, "optim/adamw_lr": 0.00027286117672920224, "perf/tokens_per_sec": 2027279.6732509614, "perf/iters_per_sec": 0.9666822782759482, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344660520553588, "data/tokens_consumed": 34647048192, "data/tokens_consumed_B": 34.647048192, "train/loss_slope": -4.291126713035989e-05} {"step": 16530, "timestamp": 1778343580.478996, "train/loss": 2.3126041650772096, "train/z_loss": 0.0013793471734970808, "train/perplexity": 10.100694310286556, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.008915097042918206, "optim/adamw_lr": 0.00026745291128754613, "perf/tokens_per_sec": 2026869.2416577504, "perf/iters_per_sec": 0.9664865692414047, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346755266189576, "data/tokens_consumed": 34668019712, "data/tokens_consumed_B": 34.668019712, "train/loss_slope": -4.354318277706373e-05} {"step": 16540, "timestamp": 1778343590.8366199, "train/loss": 2.271142268180847, "train/z_loss": 0.0013986308244057, "train/perplexity": 9.690463602765307, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.008735178858041764, "optim/adamw_lr": 0.0002620553657412529, "perf/tokens_per_sec": 2026039.0768057625, "perf/iters_per_sec": 0.9660907157925427, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035099482536316, "data/tokens_consumed": 34688991232, "data/tokens_consumed_B": 34.688991232, "train/loss_slope": -4.654423119676319e-05} {"step": 16550, "timestamp": 1778343601.1797173, "grad/layer_0/attn": 0.002657215343788266, "grad/layer_0/mlp": 0.003116884035989642, "grad/layer_0/attn_mlp_ratio": 0.8525229773883056, "grad/layer_4/attn": 0.0019252280471846461, "grad/layer_4/mlp": 0.0024343051481992006, "grad/layer_4/attn_mlp_ratio": 0.7908737199695276, "grad/layer_8/attn": 0.004666406661272049, "grad/layer_8/mlp": 0.0031599500216543674, "grad/layer_8/attn_mlp_ratio": 1.4767342779540418, "grad/layer_12/attn": 0.005830723792314529, "grad/layer_12/mlp": 0.006137925665825605, "grad/layer_12/attn_mlp_ratio": 0.9499501973090816, "grad/layer_16/attn": 0.0030714066233485937, "grad/layer_16/mlp": 0.004304924514144659, "grad/layer_16/attn_mlp_ratio": 0.7134635095018612, "grad/layer_20/attn": 0.0027872046921402216, "grad/layer_20/mlp": 0.0047285729087889194, "grad/layer_20/attn_mlp_ratio": 0.5894388617791649, "grad/layer_24/attn": 0.006350372917950153, "grad/layer_24/mlp": 0.007233739830553532, "grad/layer_24/attn_mlp_ratio": 0.8778823926372783, "grad/layer_27/attn": 0.004059283994138241, "grad/layer_27/mlp": 0.006106280721724033, "grad/layer_27/attn_mlp_ratio": 0.664771914795781} {"step": 16550, "timestamp": 1778343601.193833, "train/loss": 2.2885318040847777, "train/z_loss": 0.0013979997485876084, "train/perplexity": 9.86044997600386, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.008555675894021989, "optim/adamw_lr": 0.0002566702768206596, "perf/tokens_per_sec": 2025907.9989095773, "perf/iters_per_sec": 0.9660282129810225, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351664543151855, "data/tokens_consumed": 34709962752, "data/tokens_consumed_B": 34.709962752, "train/loss_slope": -4.5412536515797594e-05} {"step": 16560, "timestamp": 1778343611.5543149, "train/loss": 2.281105947494507, "train/z_loss": 0.0013828505994752049, "train/perplexity": 9.787498885963181, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.008376645743846894, "optim/adamw_lr": 0.00025129937231540677, "perf/tokens_per_sec": 2025142.2626001092, "perf/iters_per_sec": 0.9656630814552828, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355578660964966, "data/tokens_consumed": 34730934272, "data/tokens_consumed_B": 34.730934272, "train/loss_slope": -4.573886910251695e-05} {"step": 16570, "timestamp": 1778343621.912647, "train/loss": 2.2880468368530273, "train/z_loss": 0.0013899485114961863, "train/perplexity": 9.855669140243227, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.008198149651288986, "optim/adamw_lr": 0.0002459444895386696, "perf/tokens_per_sec": 2025699.9609921006, "perf/iters_per_sec": 0.9659290127716544, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035272765159607, "data/tokens_consumed": 34751905792, "data/tokens_consumed_B": 34.751905792, "train/loss_slope": -4.313482781841418e-05} {"step": 16575, "timestamp": 1778343627.678226, "eos/sharpness": 42.54088401794433, "eos/L0_probe": 2.2630136013031006, "eos/L_plus": 2.442967176437378, "eos/L_minus": 2.5084688663482666, "eos/grad_norm": 0.09996675699949265, "eos/embed_grad_frac": 0.19584020972251892, "eos/time_s": 0.5980491638183594} {"step": 16575, "timestamp": 1778343629.0545502, "geo/rankme_last": 431.3740539550781, "geo/layer_0/stable_rank_q_proj": 20.49077796936035, "geo/layer_0/stable_rank_k_proj": 16.61725425720215, "geo/layer_0/stable_rank_o_proj": 43.644386291503906, "geo/layer_0/stable_rank_gate_proj": 123.03825378417969, "geo/layer_0/stable_rank_down_proj": 57.952816009521484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06265098601579666, "geo/layer_0/attn_entropy_mean": 6.222927570343018, "geo/layer_0/attn_entropy_std": 0.4673004746437073, "geo/layer_7/stable_rank_q_proj": 41.60539245605469, "geo/layer_7/stable_rank_k_proj": 38.82798767089844, "geo/layer_7/stable_rank_o_proj": 86.78365325927734, "geo/layer_7/stable_rank_gate_proj": 76.88768768310547, "geo/layer_7/stable_rank_down_proj": 144.62387084960938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3984419107437134, "geo/layer_7/attn_entropy_mean": 4.750565528869629, "geo/layer_7/attn_entropy_std": 0.742874026298523, "geo/layer_14/stable_rank_q_proj": 51.53533935546875, "geo/layer_14/stable_rank_k_proj": 44.56854248046875, "geo/layer_14/stable_rank_o_proj": 42.11748504638672, "geo/layer_14/stable_rank_gate_proj": 71.69313049316406, "geo/layer_14/stable_rank_down_proj": 126.4931411743164, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3715556263923645, "geo/layer_14/attn_entropy_mean": 5.487653732299805, "geo/layer_14/attn_entropy_std": 0.46377918124198914, "geo/layer_21/stable_rank_q_proj": 38.232749938964844, "geo/layer_21/stable_rank_k_proj": 28.437847137451172, "geo/layer_21/stable_rank_o_proj": 64.06259155273438, "geo/layer_21/stable_rank_gate_proj": 59.18319320678711, "geo/layer_21/stable_rank_down_proj": 48.73200225830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13831524550914764, "geo/layer_21/attn_entropy_mean": 5.852963447570801, "geo/layer_21/attn_entropy_std": 0.3314329981803894, "geo/layer_27/stable_rank_q_proj": 45.492164611816406, "geo/layer_27/stable_rank_k_proj": 30.398481369018555, "geo/layer_27/stable_rank_o_proj": 106.47765350341797, "geo/layer_27/stable_rank_gate_proj": 68.36441802978516, "geo/layer_27/stable_rank_down_proj": 129.34925842285156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0955987498164177, "geo/layer_27/attn_entropy_mean": 4.295588493347168, "geo/layer_27/attn_entropy_std": 0.7082611918449402, "attnres/final_alpha/block_0": 0.2632533609867096, "attnres/block_norm/0": 1.7856601476669312, "attnres/final_alpha/block_1": 0.0036639522295445204, "attnres/block_norm/1": 50589.6015625, "attnres/final_alpha/block_2": 0.007682428695261478, "attnres/block_norm/2": 30388.38671875, "attnres/final_alpha/block_3": 0.009808309376239777, "attnres/block_norm/3": 75332.25, "attnres/final_alpha/block_4": 0.01123461127281189, "attnres/block_norm/4": 17805.76953125, "attnres/final_alpha/block_5": 0.6065487861633301, "attnres/block_norm/5": 7213.9716796875, "attnres/final_alpha/block_6": 0.0978085994720459, "attnres/block_norm/6": 49594.015625, "geo/tier1_time_s": 1.358109951019287, "geo/step": 16575.0, "geo/rankme_slope": 0.00021468479970113046} {"step": 16580, "timestamp": 1778343634.2346647, "train/loss": 2.3074544429779054, "train/z_loss": 0.0013947422616183759, "train/perplexity": 10.048812245344498, "train/grad_norm": 0.078125, "optim/muon_lr": 0.008020244985818863, "optim/adamw_lr": 0.00024060734957456588, "perf/tokens_per_sec": 1702693.7685973435, "perf/iters_per_sec": 0.8119076579081266, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.231667160987854, "data/tokens_consumed": 34772877312, "data/tokens_consumed_B": 34.772877312, "train/loss_slope": -4.001423436315647e-05} {"step": 16590, "timestamp": 1778343644.5897644, "train/loss": 2.276473546028137, "train/z_loss": 0.0013844740809872747, "train/perplexity": 9.742264115471919, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.007842988967895507, "optim/adamw_lr": 0.0002352896690368652, "perf/tokens_per_sec": 2026091.858064138, "perf/iters_per_sec": 0.9661158838577929, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350725173950195, "data/tokens_consumed": 34793848832, "data/tokens_consumed_B": 34.793848832, "train/loss_slope": -4.12167501921701e-05} {"step": 16600, "timestamp": 1778343654.9390247, "grad/layer_0/attn": 0.002530422993004322, "grad/layer_0/mlp": 0.003154786303639412, "grad/layer_0/attn_mlp_ratio": 0.8020901161755946, "grad/layer_4/attn": 0.0017109005711972713, "grad/layer_4/mlp": 0.0024074723478406668, "grad/layer_4/attn_mlp_ratio": 0.7106625759027185, "grad/layer_8/attn": 0.002859780564904213, "grad/layer_8/mlp": 0.0031315020751208067, "grad/layer_8/attn_mlp_ratio": 0.9132296275010215, "grad/layer_12/attn": 0.007397713139653206, "grad/layer_12/mlp": 0.006176826544106007, "grad/layer_12/attn_mlp_ratio": 1.1976559430742941, "grad/layer_16/attn": 0.008140075951814651, "grad/layer_16/mlp": 0.003942224197089672, "grad/layer_16/attn_mlp_ratio": 2.0648434331410375, "grad/layer_20/attn": 0.0021897146943956614, "grad/layer_20/mlp": 0.004988820757716894, "grad/layer_20/attn_mlp_ratio": 0.4389242983156088, "grad/layer_24/attn": 0.003946766257286072, "grad/layer_24/mlp": 0.006508259568363428, "grad/layer_24/attn_mlp_ratio": 0.6064242145210117, "grad/layer_27/attn": 0.004681205376982689, "grad/layer_27/mlp": 0.0058714840561151505, "grad/layer_27/attn_mlp_ratio": 0.7972780395067939} {"step": 16600, "timestamp": 1778343654.9532967, "train/loss": 2.284275507926941, "train/z_loss": 0.0014005088363774122, "train/perplexity": 9.818570170309213, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.0076664422452449804, "optim/adamw_lr": 0.00022999326735734937, "perf/tokens_per_sec": 2024771.1469314108, "perf/iters_per_sec": 0.965486119714456, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0357476711273192, "data/tokens_consumed": 34814820352, "data/tokens_consumed_B": 34.814820352, "train/loss_slope": -4.0519697788489184e-05} {"step": 16610, "timestamp": 1778343665.308084, "train/loss": 2.2789976596832275, "train/z_loss": 0.001392599765677005, "train/perplexity": 9.76688575819724, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.007490661442279816, "optim/adamw_lr": 0.00022471984326839444, "perf/tokens_per_sec": 2026552.632479927, "perf/iters_per_sec": 0.9663355982207904, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348371744155884, "data/tokens_consumed": 34835791872, "data/tokens_consumed_B": 34.835791872, "train/loss_slope": -4.219453814792948e-05} {"step": 16620, "timestamp": 1778343675.6750698, "train/loss": 2.2950359344482423, "train/z_loss": 0.001389975636266172, "train/perplexity": 9.92479264782528, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.00731570303440094, "optim/adamw_lr": 0.0002194710910320282, "perf/tokens_per_sec": 2024473.0849444363, "perf/iters_per_sec": 0.9653439926836187, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359001636505127, "data/tokens_consumed": 34856763392, "data/tokens_consumed_B": 34.856763392, "train/loss_slope": -4.089338298034209e-05} {"step": 16630, "timestamp": 1778343686.0328562, "train/loss": 2.273176145553589, "train/z_loss": 0.0014006303739733995, "train/perplexity": 9.7101928740765, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.00714162677526474, "optim/adamw_lr": 0.00021424880325794218, "perf/tokens_per_sec": 2025790.5145200156, "perf/iters_per_sec": 0.9659721920585707, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352264881134032, "data/tokens_consumed": 34877734912, "data/tokens_consumed_B": 34.877734912, "train/loss_slope": -4.102546651550072e-05} {"step": 16640, "timestamp": 1778343696.3891568, "train/loss": 2.301873207092285, "train/z_loss": 0.001383995299693197, "train/perplexity": 9.992883674288903, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.006968487799167633, "optim/adamw_lr": 0.00020905463397502896, "perf/tokens_per_sec": 2026076.0373983516, "perf/iters_per_sec": 0.9661083399764784, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350805997848511, "data/tokens_consumed": 34898706432, "data/tokens_consumed_B": 34.898706432, "train/loss_slope": -4.163671701547442e-05} {"step": 16650, "timestamp": 1778343706.7514567, "grad/layer_0/attn": 0.0027362043038010597, "grad/layer_0/mlp": 0.0032385638915002346, "grad/layer_0/attn_mlp_ratio": 0.8448819634203178, "grad/layer_4/attn": 0.0020783101208508015, "grad/layer_4/mlp": 0.002502921735867858, "grad/layer_4/attn_mlp_ratio": 0.8303535855845744, "grad/layer_8/attn": 0.0034903360065072775, "grad/layer_8/mlp": 0.00311109097674489, "grad/layer_8/attn_mlp_ratio": 1.121900940990512, "grad/layer_12/attn": 0.005581865087151527, "grad/layer_12/mlp": 0.006775568705052137, "grad/layer_12/attn_mlp_ratio": 0.8238223605654279, "grad/layer_16/attn": 0.005838421173393726, "grad/layer_16/mlp": 0.00397431431338191, "grad/layer_16/attn_mlp_ratio": 1.4690385727246904, "grad/layer_20/attn": 0.0023151645436882973, "grad/layer_20/mlp": 0.004837526008486748, "grad/layer_20/attn_mlp_ratio": 0.47858440280594505, "grad/layer_24/attn": 0.0035998590756207705, "grad/layer_24/mlp": 0.00658287713304162, "grad/layer_24/attn_mlp_ratio": 0.5468519232824056, "grad/layer_27/attn": 0.004622165579348803, "grad/layer_27/mlp": 0.006012136582285166, "grad/layer_27/attn_mlp_ratio": 0.7688058045932439} {"step": 16650, "timestamp": 1778343707.3373404, "eos/sharpness": 18.751716613769528, "eos/L0_probe": 2.2614316940307617, "eos/L_plus": 2.3657524585723877, "eos/L_minus": 2.344628095626831, "eos/grad_norm": 0.07933561503887177, "eos/embed_grad_frac": 0.2898100018501282, "eos/time_s": 0.5830667018890381} {"step": 16650, "timestamp": 1778343707.3657262, "train/loss": 2.300200843811035, "train/z_loss": 0.0013945354148745537, "train/perplexity": 9.976185908814838, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.006796345114707947, "optim/adamw_lr": 0.00020389035344123838, "perf/tokens_per_sec": 1911330.8369843473, "perf/iters_per_sec": 0.9113935646936165, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0972208261489869, "data/tokens_consumed": 34919677952, "data/tokens_consumed_B": 34.919677952, "train/loss_slope": -4.011866669378728e-05} {"step": 16650, "timestamp": 1778343708.727969, "geo/rankme_last": 430.6483459472656, "geo/layer_0/stable_rank_q_proj": 20.47801971435547, "geo/layer_0/stable_rank_k_proj": 16.615373611450195, "geo/layer_0/stable_rank_o_proj": 43.645416259765625, "geo/layer_0/stable_rank_gate_proj": 122.99797821044922, "geo/layer_0/stable_rank_down_proj": 57.962242126464844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06768288463354111, "geo/layer_0/attn_entropy_mean": 6.222105979919434, "geo/layer_0/attn_entropy_std": 0.4672917425632477, "geo/layer_7/stable_rank_q_proj": 41.608463287353516, "geo/layer_7/stable_rank_k_proj": 38.8212776184082, "geo/layer_7/stable_rank_o_proj": 86.76578521728516, "geo/layer_7/stable_rank_gate_proj": 76.91242980957031, "geo/layer_7/stable_rank_down_proj": 144.60081481933594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3930949568748474, "geo/layer_7/attn_entropy_mean": 4.745194435119629, "geo/layer_7/attn_entropy_std": 0.7448490262031555, "geo/layer_14/stable_rank_q_proj": 51.517303466796875, "geo/layer_14/stable_rank_k_proj": 44.557918548583984, "geo/layer_14/stable_rank_o_proj": 42.09996032714844, "geo/layer_14/stable_rank_gate_proj": 71.67279052734375, "geo/layer_14/stable_rank_down_proj": 126.4505615234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37804076075553894, "geo/layer_14/attn_entropy_mean": 5.504494667053223, "geo/layer_14/attn_entropy_std": 0.4642587900161743, "geo/layer_21/stable_rank_q_proj": 38.226898193359375, "geo/layer_21/stable_rank_k_proj": 28.446958541870117, "geo/layer_21/stable_rank_o_proj": 64.06256866455078, "geo/layer_21/stable_rank_gate_proj": 59.16074752807617, "geo/layer_21/stable_rank_down_proj": 48.73505401611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1359623521566391, "geo/layer_21/attn_entropy_mean": 5.850217819213867, "geo/layer_21/attn_entropy_std": 0.3266332447528839, "geo/layer_27/stable_rank_q_proj": 45.47909927368164, "geo/layer_27/stable_rank_k_proj": 30.40507698059082, "geo/layer_27/stable_rank_o_proj": 106.50666809082031, "geo/layer_27/stable_rank_gate_proj": 68.32882690429688, "geo/layer_27/stable_rank_down_proj": 129.3304443359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09466315060853958, "geo/layer_27/attn_entropy_mean": 4.280867576599121, "geo/layer_27/attn_entropy_std": 0.7122219204902649, "attnres/final_alpha/block_0": 0.26389625668525696, "attnres/block_norm/0": 1.7856049537658691, "attnres/final_alpha/block_1": 0.0036653277929872274, "attnres/block_norm/1": 50563.0390625, "attnres/final_alpha/block_2": 0.007738810032606125, "attnres/block_norm/2": 30347.439453125, "attnres/final_alpha/block_3": 0.009895018301904202, "attnres/block_norm/3": 75393.109375, "attnres/final_alpha/block_4": 0.011133571155369282, "attnres/block_norm/4": 17772.076171875, "attnres/final_alpha/block_5": 0.60548335313797, "attnres/block_norm/5": 7202.328125, "attnres/final_alpha/block_6": 0.09818767011165619, "attnres/block_norm/6": 49627.58203125, "geo/tier1_time_s": 1.3578503131866455, "geo/step": 16650.0, "geo/rankme_slope": 0.00020000146542992198} {"step": 16660, "timestamp": 1778343719.1147764, "train/loss": 2.235507941246033, "train/z_loss": 0.001400254131294787, "train/perplexity": 9.351230521755129, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.0066252538561820985, "optim/adamw_lr": 0.00019875761568546293, "perf/tokens_per_sec": 1785583.1684639303, "perf/iters_per_sec": 0.8514324037856723, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1744913578033447, "data/tokens_consumed": 34940649472, "data/tokens_consumed_B": 34.940649472, "train/loss_slope": -4.104434516098703e-05} {"step": 16670, "timestamp": 1778343729.471461, "train/loss": 2.243055248260498, "train/z_loss": 0.001400474924594164, "train/perplexity": 9.42207413242415, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.0064552691578865055, "optim/adamw_lr": 0.00019365807473659515, "perf/tokens_per_sec": 2026261.0939749961, "perf/iters_per_sec": 0.9661965818285924, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0349860668182373, "data/tokens_consumed": 34961620992, "data/tokens_consumed_B": 34.961620992, "train/loss_slope": -4.519643815043736e-05} {"step": 16680, "timestamp": 1778343739.8489761, "train/loss": 2.2767289161682127, "train/z_loss": 0.0013900274527259171, "train/perplexity": 9.744752316516344, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.006286449134349823, "optim/adamw_lr": 0.00018859347403049468, "perf/tokens_per_sec": 2022049.9554176722, "perf/iters_per_sec": 0.9641885544861184, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.037141537666321, "data/tokens_consumed": 34982592512, "data/tokens_consumed_B": 34.982592512, "train/loss_slope": -4.537541098756331e-05} {"step": 16690, "timestamp": 1778343750.222285, "train/loss": 2.323512148857117, "train/z_loss": 0.0013918788987211882, "train/perplexity": 10.211475621931765, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.006118848025798798, "optim/adamw_lr": 0.0001835654407739639, "perf/tokens_per_sec": 2022697.2986188247, "perf/iters_per_sec": 0.9644972317785381, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0368096113204956, "data/tokens_consumed": 35003564032, "data/tokens_consumed_B": 35.003564032, "train/loss_slope": -4.255682222484791e-05} {"step": 16700, "timestamp": 1778343760.5568645, "grad/layer_0/attn": 0.002635064534842968, "grad/layer_0/mlp": 0.0029579768888652325, "grad/layer_0/attn_mlp_ratio": 0.8908333448036243, "grad/layer_4/attn": 0.0016948635457083583, "grad/layer_4/mlp": 0.0025184117257595062, "grad/layer_4/attn_mlp_ratio": 0.6729890355391811, "grad/layer_8/attn": 0.005400878377258778, "grad/layer_8/mlp": 0.0033273284789174795, "grad/layer_8/attn_mlp_ratio": 1.6231875659890227, "grad/layer_12/attn": 0.005271660629659891, "grad/layer_12/mlp": 0.007097815163433552, "grad/layer_12/attn_mlp_ratio": 0.7427159532903562, "grad/layer_16/attn": 0.0035110577009618282, "grad/layer_16/mlp": 0.0038943737745285034, "grad/layer_16/attn_mlp_ratio": 0.9015718095086379, "grad/layer_20/attn": 0.0025292157661169767, "grad/layer_20/mlp": 0.004383689258247614, "grad/layer_20/attn_mlp_ratio": 0.5769605369865977, "grad/layer_24/attn": 0.006401400547474623, "grad/layer_24/mlp": 0.007224735803902149, "grad/layer_24/attn_mlp_ratio": 0.8860393836703657, "grad/layer_27/attn": 0.004428466781973839, "grad/layer_27/mlp": 0.006680045742541552, "grad/layer_27/attn_mlp_ratio": 0.6629395795117693} {"step": 16700, "timestamp": 1778343760.5712073, "train/loss": 2.283068084716797, "train/z_loss": 0.0013989627012051642, "train/perplexity": 9.806722155018887, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.005952519774436951, "optim/adamw_lr": 0.00017857559323310851, "perf/tokens_per_sec": 2027621.982372784, "perf/iters_per_sec": 0.9668455039848252, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034291410446167, "data/tokens_consumed": 35024535552, "data/tokens_consumed_B": 35.024535552, "train/loss_slope": -4.3505810303549266e-05} {"step": 16710, "timestamp": 1778343770.921869, "train/loss": 2.266732931137085, "train/z_loss": 0.0013876840472221374, "train/perplexity": 9.647829146549773, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.005787521004676819, "optim/adamw_lr": 0.00017362563014030455, "perf/tokens_per_sec": 2027513.3188324792, "perf/iters_per_sec": 0.9667936891710659, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034346842765808, "data/tokens_consumed": 35045507072, "data/tokens_consumed_B": 35.045507072, "train/loss_slope": -4.685562626220354e-05} {"step": 16720, "timestamp": 1778343781.2828648, "train/loss": 2.2602785348892214, "train/z_loss": 0.0013927605003118515, "train/perplexity": 9.585758763214457, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.00562390387058258, "optim/adamw_lr": 0.0001687171161174774, "perf/tokens_per_sec": 2025429.0951903383, "perf/iters_per_sec": 0.9657998538924877, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354112148284913, "data/tokens_consumed": 35066478592, "data/tokens_consumed_B": 35.066478592, "train/loss_slope": -4.78863860764662e-05} {"step": 16725, "timestamp": 1778343787.0376062, "eos/sharpness": 3.332734107971191, "eos/L0_probe": 2.258596420288086, "eos/L_plus": 2.281322717666626, "eos/L_minus": 2.269197463989258, "eos/grad_norm": 0.07156120985746384, "eos/embed_grad_frac": 0.39744865894317627, "eos/time_s": 0.5832564830780029} {"step": 16725, "timestamp": 1778343788.4138925, "geo/rankme_last": 430.6304931640625, "geo/layer_0/stable_rank_q_proj": 20.470888137817383, "geo/layer_0/stable_rank_k_proj": 16.601755142211914, "geo/layer_0/stable_rank_o_proj": 43.6417350769043, "geo/layer_0/stable_rank_gate_proj": 123.01099395751953, "geo/layer_0/stable_rank_down_proj": 57.97819900512695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06622999161481857, "geo/layer_0/attn_entropy_mean": 6.2204203605651855, "geo/layer_0/attn_entropy_std": 0.4668000638484955, "geo/layer_7/stable_rank_q_proj": 41.597198486328125, "geo/layer_7/stable_rank_k_proj": 38.81100082397461, "geo/layer_7/stable_rank_o_proj": 86.78865051269531, "geo/layer_7/stable_rank_gate_proj": 76.87155151367188, "geo/layer_7/stable_rank_down_proj": 144.6186065673828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3991791307926178, "geo/layer_7/attn_entropy_mean": 4.750179290771484, "geo/layer_7/attn_entropy_std": 0.7380221486091614, "geo/layer_14/stable_rank_q_proj": 51.52858352661133, "geo/layer_14/stable_rank_k_proj": 44.57194519042969, "geo/layer_14/stable_rank_o_proj": 42.100223541259766, "geo/layer_14/stable_rank_gate_proj": 71.66447448730469, "geo/layer_14/stable_rank_down_proj": 126.41932678222656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38168230652809143, "geo/layer_14/attn_entropy_mean": 5.495359897613525, "geo/layer_14/attn_entropy_std": 0.4735442101955414, "geo/layer_21/stable_rank_q_proj": 38.21592712402344, "geo/layer_21/stable_rank_k_proj": 28.445236206054688, "geo/layer_21/stable_rank_o_proj": 64.04422760009766, "geo/layer_21/stable_rank_gate_proj": 59.1614875793457, "geo/layer_21/stable_rank_down_proj": 48.73197937011719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13454337418079376, "geo/layer_21/attn_entropy_mean": 5.853933334350586, "geo/layer_21/attn_entropy_std": 0.3312121629714966, "geo/layer_27/stable_rank_q_proj": 45.46532440185547, "geo/layer_27/stable_rank_k_proj": 30.422115325927734, "geo/layer_27/stable_rank_o_proj": 106.49906158447266, "geo/layer_27/stable_rank_gate_proj": 68.31913757324219, "geo/layer_27/stable_rank_down_proj": 129.3305206298828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08995848149061203, "geo/layer_27/attn_entropy_mean": 4.282309055328369, "geo/layer_27/attn_entropy_std": 0.711995542049408, "attnres/final_alpha/block_0": 0.2646764814853668, "attnres/block_norm/0": 1.7856115102767944, "attnres/final_alpha/block_1": 0.0037068785168230534, "attnres/block_norm/1": 50559.74609375, "attnres/final_alpha/block_2": 0.007711193989962339, "attnres/block_norm/2": 30338.1796875, "attnres/final_alpha/block_3": 0.009788895025849342, "attnres/block_norm/3": 75492.7734375, "attnres/final_alpha/block_4": 0.011188005097210407, "attnres/block_norm/4": 17761.328125, "attnres/final_alpha/block_5": 0.6039431095123291, "attnres/block_norm/5": 7215.3193359375, "attnres/final_alpha/block_6": 0.098985455930233, "attnres/block_norm/6": 49596.01171875, "geo/tier1_time_s": 1.35798978805542, "geo/step": 16725.0, "geo/rankme_slope": 0.0002092885005564726} {"step": 16730, "timestamp": 1778343793.590688, "train/loss": 2.2519025087356566, "train/z_loss": 0.0013926234212704002, "train/perplexity": 9.505803518180326, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.00546172559261322, "optim/adamw_lr": 0.0001638517677783966, "perf/tokens_per_sec": 1704610.0905935646, "perf/iters_per_sec": 0.8128214314430068, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2302825212478639, "data/tokens_consumed": 35087450112, "data/tokens_consumed_B": 35.087450112, "train/loss_slope": -4.874227232236313e-05} {"step": 16740, "timestamp": 1778343804.3383245, "train/loss": 2.2906126737594605, "train/z_loss": 0.0013985548517666756, "train/perplexity": 9.88098965011872, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.005301034450531006, "optim/adamw_lr": 0.00015903103351593017, "perf/tokens_per_sec": 1952430.4154828314, "perf/iters_per_sec": 0.93099137090818, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0741238117218017, "data/tokens_consumed": 35108421632, "data/tokens_consumed_B": 35.108421632, "train/loss_slope": -4.895244367194896e-05} {"step": 16750, "timestamp": 1778343814.6841555, "grad/layer_0/attn": 0.0024342359974980354, "grad/layer_0/mlp": 0.00293755647726357, "grad/layer_0/attn_mlp_ratio": 0.8286601239747375, "grad/layer_4/attn": 0.001797491335310042, "grad/layer_4/mlp": 0.0025048968382179737, "grad/layer_4/attn_mlp_ratio": 0.7175909347347471, "grad/layer_8/attn": 0.004124753642827272, "grad/layer_8/mlp": 0.0031942110508680344, "grad/layer_8/attn_mlp_ratio": 1.2913215338648985, "grad/layer_12/attn": 0.004305542446672916, "grad/layer_12/mlp": 0.006118733435869217, "grad/layer_12/attn_mlp_ratio": 0.7036656231935876, "grad/layer_16/attn": 0.0025827193167060614, "grad/layer_16/mlp": 0.0039512538351118565, "grad/layer_16/attn_mlp_ratio": 0.6536454905505705, "grad/layer_20/attn": 0.002137213945388794, "grad/layer_20/mlp": 0.0047261398285627365, "grad/layer_20/attn_mlp_ratio": 0.45221131360762334, "grad/layer_24/attn": 0.004192324820905924, "grad/layer_24/mlp": 0.00673358328640461, "grad/layer_24/attn_mlp_ratio": 0.6225993769335961, "grad/layer_27/attn": 0.007273267023265362, "grad/layer_27/mlp": 0.006052622105926275, "grad/layer_27/attn_mlp_ratio": 1.2016720647364911} {"step": 16750, "timestamp": 1778343814.6983576, "train/loss": 2.2888116359710695, "train/z_loss": 0.0013905565487220884, "train/perplexity": 9.863209630421974, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.0051418858766555785, "optim/adamw_lr": 0.00015425657629966735, "perf/tokens_per_sec": 2025319.8737886315, "perf/iters_per_sec": 0.9657477730696828, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354670524597167, "data/tokens_consumed": 35129393152, "data/tokens_consumed_B": 35.129393152, "train/loss_slope": -4.902352704466765e-05} {"step": 16760, "timestamp": 1778343825.0486803, "train/loss": 2.281235909461975, "train/z_loss": 0.0013831470510922371, "train/perplexity": 9.788770971234555, "train/grad_norm": 0.06640625, "optim/muon_lr": 0.004984334111213684, "optim/adamw_lr": 0.0001495300233364105, "perf/tokens_per_sec": 2027119.984116913, "perf/iters_per_sec": 0.9666061325630727, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034547543525696, "data/tokens_consumed": 35150364672, "data/tokens_consumed_B": 35.150364672, "train/loss_slope": -5.051866644966038e-05} {"step": 16770, "timestamp": 1778343835.422261, "train/loss": 2.2468451976776125, "train/z_loss": 0.0013875441742129623, "train/perplexity": 9.457851070358737, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.004828426241874695, "optim/adamw_lr": 0.00014485278725624083, "perf/tokens_per_sec": 2023139.9177150133, "perf/iters_per_sec": 0.9647082890105311, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0365827798843383, "data/tokens_consumed": 35171336192, "data/tokens_consumed_B": 35.171336192, "train/loss_slope": -5.463691058188915e-05} {"step": 16780, "timestamp": 1778343845.7884383, "train/loss": 2.2462997436523438, "train/z_loss": 0.0013886653003282845, "train/perplexity": 9.452693654116613, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.004674216508865356, "optim/adamw_lr": 0.00014022649526596067, "perf/tokens_per_sec": 2024107.7106177893, "perf/iters_per_sec": 0.96516976862802, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.036087155342102, "data/tokens_consumed": 35192307712, "data/tokens_consumed_B": 35.192307712, "train/loss_slope": -5.7307048377567216e-05} {"step": 16790, "timestamp": 1778343856.136821, "train/loss": 2.273610472679138, "train/z_loss": 0.0013950857915915548, "train/perplexity": 9.714411190234278, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0045217561721801755, "optim/adamw_lr": 0.00013565268516540526, "perf/tokens_per_sec": 2027492.475458922, "perf/iters_per_sec": 0.9667837502760516, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034357476234436, "data/tokens_consumed": 35213279232, "data/tokens_consumed_B": 35.213279232, "train/loss_slope": -5.381280532514157e-05} {"step": 16800, "timestamp": 1778343866.4788907, "grad/layer_0/attn": 0.002487755147740245, "grad/layer_0/mlp": 0.0029612609650939703, "grad/layer_0/attn_mlp_ratio": 0.840099907794283, "grad/layer_4/attn": 0.001837817719206214, "grad/layer_4/mlp": 0.0023824686650186777, "grad/layer_4/attn_mlp_ratio": 0.7713921568209127, "grad/layer_8/attn": 0.0029357392340898514, "grad/layer_8/mlp": 0.0031102949287742376, "grad/layer_8/attn_mlp_ratio": 0.9438780588112959, "grad/layer_12/attn": 0.00673047685995698, "grad/layer_12/mlp": 0.006045460235327482, "grad/layer_12/attn_mlp_ratio": 1.1133108955535627, "grad/layer_16/attn": 0.0028198540676385164, "grad/layer_16/mlp": 0.0038014533929526806, "grad/layer_16/attn_mlp_ratio": 0.7417831292336206, "grad/layer_20/attn": 0.0023117410019040108, "grad/layer_20/mlp": 0.004544230177998543, "grad/layer_20/attn_mlp_ratio": 0.5087200384840955, "grad/layer_24/attn": 0.0049988883547484875, "grad/layer_24/mlp": 0.006747778505086899, "grad/layer_24/attn_mlp_ratio": 0.7408198530669059, "grad/layer_27/attn": 0.0033523181919008493, "grad/layer_27/mlp": 0.0058675725013017654, "grad/layer_27/attn_mlp_ratio": 0.5713296485086716} {"step": 16800, "timestamp": 1778343867.0589685, "eos/sharpness": 27.600216865539544, "eos/L0_probe": 2.2566394805908203, "eos/L_plus": 2.387772560119629, "eos/L_minus": 2.4015085697174072, "eos/grad_norm": 0.08312759548425674, "eos/embed_grad_frac": 0.2610781192779541, "eos/time_s": 0.5773804187774658} {"step": 16800, "timestamp": 1778343867.0781982, "train/loss": 2.2979448080062865, "train/z_loss": 0.0013976827380247415, "train/perplexity": 9.953704645012996, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.004371092319488525, "optim/adamw_lr": 0.00013113276958465574, "perf/tokens_per_sec": 1917910.3419556983, "perf/iters_per_sec": 0.9145309171465389, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.093456745147705, "data/tokens_consumed": 35234250752, "data/tokens_consumed_B": 35.234250752, "train/loss_slope": -5.038573962948967e-05} {"step": 16800, "timestamp": 1778343868.4379587, "geo/rankme_last": 430.914794921875, "geo/layer_0/stable_rank_q_proj": 20.467487335205078, "geo/layer_0/stable_rank_k_proj": 16.596141815185547, "geo/layer_0/stable_rank_o_proj": 43.63397979736328, "geo/layer_0/stable_rank_gate_proj": 122.9932861328125, "geo/layer_0/stable_rank_down_proj": 57.983482360839844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06485369056463242, "geo/layer_0/attn_entropy_mean": 6.220317840576172, "geo/layer_0/attn_entropy_std": 0.4662898778915405, "geo/layer_7/stable_rank_q_proj": 41.59455871582031, "geo/layer_7/stable_rank_k_proj": 38.813846588134766, "geo/layer_7/stable_rank_o_proj": 86.76834106445312, "geo/layer_7/stable_rank_gate_proj": 76.85588073730469, "geo/layer_7/stable_rank_down_proj": 144.6235809326172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.39580339193344116, "geo/layer_7/attn_entropy_mean": 4.753801345825195, "geo/layer_7/attn_entropy_std": 0.7406201958656311, "geo/layer_14/stable_rank_q_proj": 51.52339172363281, "geo/layer_14/stable_rank_k_proj": 44.5557746887207, "geo/layer_14/stable_rank_o_proj": 42.10359191894531, "geo/layer_14/stable_rank_gate_proj": 71.66645050048828, "geo/layer_14/stable_rank_down_proj": 126.4506607055664, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37966495752334595, "geo/layer_14/attn_entropy_mean": 5.496336460113525, "geo/layer_14/attn_entropy_std": 0.46984490752220154, "geo/layer_21/stable_rank_q_proj": 38.22477722167969, "geo/layer_21/stable_rank_k_proj": 28.440078735351562, "geo/layer_21/stable_rank_o_proj": 64.04422760009766, "geo/layer_21/stable_rank_gate_proj": 59.14196014404297, "geo/layer_21/stable_rank_down_proj": 48.72801208496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14149433374404907, "geo/layer_21/attn_entropy_mean": 5.849526405334473, "geo/layer_21/attn_entropy_std": 0.3319365084171295, "geo/layer_27/stable_rank_q_proj": 45.46615219116211, "geo/layer_27/stable_rank_k_proj": 30.418363571166992, "geo/layer_27/stable_rank_o_proj": 106.49376678466797, "geo/layer_27/stable_rank_gate_proj": 68.31078338623047, "geo/layer_27/stable_rank_down_proj": 129.33041381835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08777784556150436, "geo/layer_27/attn_entropy_mean": 4.281941890716553, "geo/layer_27/attn_entropy_std": 0.7133074998855591, "attnres/final_alpha/block_0": 0.26478588581085205, "attnres/block_norm/0": 1.785561203956604, "attnres/final_alpha/block_1": 0.00368365622125566, "attnres/block_norm/1": 50565.7265625, "attnres/final_alpha/block_2": 0.0076847923919558525, "attnres/block_norm/2": 30390.79296875, "attnres/final_alpha/block_3": 0.00985822081565857, "attnres/block_norm/3": 75598.65625, "attnres/final_alpha/block_4": 0.011205172166228294, "attnres/block_norm/4": 17740.52734375, "attnres/final_alpha/block_5": 0.6041099429130554, "attnres/block_norm/5": 7224.2939453125, "attnres/final_alpha/block_6": 0.0986722931265831, "attnres/block_norm/6": 49482.7109375, "geo/tier1_time_s": 1.3557794094085693, "geo/step": 16800.0, "geo/rankme_slope": 0.00026585677239645856} {"step": 16810, "timestamp": 1778343879.2708857, "train/loss": 2.293570327758789, "train/z_loss": 0.0013971437350846828, "train/perplexity": 9.91025745936571, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.004222276210784913, "optim/adamw_lr": 0.00012666828632354736, "perf/tokens_per_sec": 1720540.7116015404, "perf/iters_per_sec": 0.820417743492861, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2188912391662599, "data/tokens_consumed": 35255222272, "data/tokens_consumed_B": 35.255222272, "train/loss_slope": -5.098080281699131e-05} {"step": 16820, "timestamp": 1778343889.6388, "train/loss": 2.2524970531463624, "train/z_loss": 0.0013960397918708622, "train/perplexity": 9.511456820934576, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.004075358510017395, "optim/adamw_lr": 0.00012226075530052185, "perf/tokens_per_sec": 2027207.5343768334, "perf/iters_per_sec": 0.966647879780213, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345028638839722, "data/tokens_consumed": 35276193792, "data/tokens_consumed_B": 35.276193792, "train/loss_slope": -5.291436443162903e-05} {"step": 16830, "timestamp": 1778343899.9956293, "train/loss": 2.283615779876709, "train/z_loss": 0.0013971224310807885, "train/perplexity": 9.812094720407526, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.003930383324623108, "optim/adamw_lr": 0.00011791149973869323, "perf/tokens_per_sec": 2026153.9297291364, "perf/iters_per_sec": 0.9661454819341356, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035040807723999, "data/tokens_consumed": 35297165312, "data/tokens_consumed_B": 35.297165312, "train/loss_slope": -5.1151390459099005e-05} {"step": 16840, "timestamp": 1778343910.3570852, "train/loss": 2.2579181671142576, "train/z_loss": 0.0013938990421593189, "train/perplexity": 9.563159528875753, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.003787400722503662, "optim/adamw_lr": 0.00011362202167510986, "perf/tokens_per_sec": 2025231.6937966791, "perf/iters_per_sec": 0.9657057255729099, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035512137413025, "data/tokens_consumed": 35318136832, "data/tokens_consumed_B": 35.318136832, "train/loss_slope": -5.496125540288404e-05} {"step": 16850, "timestamp": 1778343920.7036693, "grad/layer_0/attn": 0.002620311686769128, "grad/layer_0/mlp": 0.0028932883869856596, "grad/layer_0/attn_mlp_ratio": 0.9056517172606837, "grad/layer_4/attn": 0.0018433568766340613, "grad/layer_4/mlp": 0.0024625647347420454, "grad/layer_4/attn_mlp_ratio": 0.7485516119729508, "grad/layer_8/attn": 0.0046434467658400536, "grad/layer_8/mlp": 0.0032156072556972504, "grad/layer_8/attn_mlp_ratio": 1.444034128611203, "grad/layer_12/attn": 0.005635004490613937, "grad/layer_12/mlp": 0.006019085645675659, "grad/layer_12/attn_mlp_ratio": 0.9361894361884344, "grad/layer_16/attn": 0.0037615057080984116, "grad/layer_16/mlp": 0.0038849469274282455, "grad/layer_16/attn_mlp_ratio": 0.9682257393837492, "grad/layer_20/attn": 0.0030633348505944014, "grad/layer_20/mlp": 0.0042204540222883224, "grad/layer_20/attn_mlp_ratio": 0.7258306243436825, "grad/layer_24/attn": 0.0037232921458780766, "grad/layer_24/mlp": 0.006177782546728849, "grad/layer_24/attn_mlp_ratio": 0.6026906996881104, "grad/layer_27/attn": 0.005842883139848709, "grad/layer_27/mlp": 0.005773945711553097, "grad/layer_27/attn_mlp_ratio": 1.0119393791603786} {"step": 16850, "timestamp": 1778343920.717989, "train/loss": 2.2688785791397095, "train/z_loss": 0.0013964406214654445, "train/perplexity": 9.668552216143757, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0036464571952819825, "optim/adamw_lr": 0.00010939371585845947, "perf/tokens_per_sec": 2025459.503927936, "perf/iters_per_sec": 0.9658143539085083, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353956699371338, "data/tokens_consumed": 35339108352, "data/tokens_consumed_B": 35.339108352, "train/loss_slope": -5.4145232786332525e-05} {"step": 16860, "timestamp": 1778343931.0735543, "train/loss": 2.2819369792938233, "train/z_loss": 0.0013907542335800827, "train/perplexity": 9.795635989400745, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.003507601022720337, "optim/adamw_lr": 0.0001052280306816101, "perf/tokens_per_sec": 2026557.0213705273, "perf/iters_per_sec": 0.9663376910069119, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348349332809448, "data/tokens_consumed": 35360079872, "data/tokens_consumed_B": 35.360079872, "train/loss_slope": -5.4666312049181296e-05} {"step": 16870, "timestamp": 1778343941.4262533, "train/loss": 2.281097149848938, "train/z_loss": 0.0013945180922746657, "train/perplexity": 9.787412779395744, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.003370874524116516, "optim/adamw_lr": 0.00010112623572349547, "perf/tokens_per_sec": 2026898.8529426926, "perf/iters_per_sec": 0.9665006890023673, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346604108810424, "data/tokens_consumed": 35381051392, "data/tokens_consumed_B": 35.381051392, "train/loss_slope": -5.403498710543051e-05} {"step": 16875, "timestamp": 1778343947.1780405, "eos/sharpness": 35.41407585144042, "eos/L0_probe": 2.255711078643799, "eos/L_plus": 2.4502651691436768, "eos/L_minus": 2.415297746658325, "eos/grad_norm": 0.08733533322811127, "eos/embed_grad_frac": 0.2464069128036499, "eos/time_s": 0.5814497470855713} {"step": 16875, "timestamp": 1778343948.554523, "geo/rankme_last": 431.13427734375, "geo/layer_0/stable_rank_q_proj": 20.46321678161621, "geo/layer_0/stable_rank_k_proj": 16.593059539794922, "geo/layer_0/stable_rank_o_proj": 43.636817932128906, "geo/layer_0/stable_rank_gate_proj": 122.99950408935547, "geo/layer_0/stable_rank_down_proj": 57.98952865600586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06722357869148254, "geo/layer_0/attn_entropy_mean": 6.220503330230713, "geo/layer_0/attn_entropy_std": 0.46604564785957336, "geo/layer_7/stable_rank_q_proj": 41.595481872558594, "geo/layer_7/stable_rank_k_proj": 38.813575744628906, "geo/layer_7/stable_rank_o_proj": 86.76403045654297, "geo/layer_7/stable_rank_gate_proj": 76.84506225585938, "geo/layer_7/stable_rank_down_proj": 144.6556854248047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3948434591293335, "geo/layer_7/attn_entropy_mean": 4.74381160736084, "geo/layer_7/attn_entropy_std": 0.7397783398628235, "geo/layer_14/stable_rank_q_proj": 51.52389907836914, "geo/layer_14/stable_rank_k_proj": 44.560123443603516, "geo/layer_14/stable_rank_o_proj": 42.106971740722656, "geo/layer_14/stable_rank_gate_proj": 71.67445373535156, "geo/layer_14/stable_rank_down_proj": 126.4629135131836, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37471768260002136, "geo/layer_14/attn_entropy_mean": 5.491299629211426, "geo/layer_14/attn_entropy_std": 0.4731951355934143, "geo/layer_21/stable_rank_q_proj": 38.225162506103516, "geo/layer_21/stable_rank_k_proj": 28.445091247558594, "geo/layer_21/stable_rank_o_proj": 64.0541763305664, "geo/layer_21/stable_rank_gate_proj": 59.136024475097656, "geo/layer_21/stable_rank_down_proj": 48.722965240478516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1383214294910431, "geo/layer_21/attn_entropy_mean": 5.84517765045166, "geo/layer_21/attn_entropy_std": 0.33376532793045044, "geo/layer_27/stable_rank_q_proj": 45.46786117553711, "geo/layer_27/stable_rank_k_proj": 30.417343139648438, "geo/layer_27/stable_rank_o_proj": 106.4898452758789, "geo/layer_27/stable_rank_gate_proj": 68.30753326416016, "geo/layer_27/stable_rank_down_proj": 129.3156280517578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.086654432117939, "geo/layer_27/attn_entropy_mean": 4.279061317443848, "geo/layer_27/attn_entropy_std": 0.710797131061554, "attnres/final_alpha/block_0": 0.2648402452468872, "attnres/block_norm/0": 1.7855560779571533, "attnres/final_alpha/block_1": 0.003691059537231922, "attnres/block_norm/1": 50540.0390625, "attnres/final_alpha/block_2": 0.007674204185605049, "attnres/block_norm/2": 30374.986328125, "attnres/final_alpha/block_3": 0.009801773354411125, "attnres/block_norm/3": 75668.1875, "attnres/final_alpha/block_4": 0.01117509976029396, "attnres/block_norm/4": 17714.396484375, "attnres/final_alpha/block_5": 0.6041929125785828, "attnres/block_norm/5": 7226.845703125, "attnres/final_alpha/block_6": 0.09862470626831055, "attnres/block_norm/6": 49476.484375, "geo/tier1_time_s": 1.3585875034332275, "geo/step": 16875.0, "geo/rankme_slope": 0.00028324073770133056} {"step": 16880, "timestamp": 1778343953.734591, "train/loss": 2.2501866102218626, "train/z_loss": 0.0013955855509266257, "train/perplexity": 9.48950651005569, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.003236322999000549, "optim/adamw_lr": 9.708968997001647e-05, "perf/tokens_per_sec": 1704794.7372496424, "perf/iters_per_sec": 0.8129094778297626, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.230149269104004, "data/tokens_consumed": 35402022912, "data/tokens_consumed_B": 35.402022912, "train/loss_slope": -5.47346450171598e-05} {"step": 16890, "timestamp": 1778343964.0977616, "train/loss": 2.2623862504959105, "train/z_loss": 0.001387712953146547, "train/perplexity": 9.605984123728106, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.003103994131088257, "optim/adamw_lr": 9.31198239326477e-05, "perf/tokens_per_sec": 2024585.5701992994, "perf/iters_per_sec": 0.9653976298328873, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358426094055175, "data/tokens_consumed": 35422994432, "data/tokens_consumed_B": 35.422994432, "train/loss_slope": -5.5755586964641304e-05} {"step": 16900, "timestamp": 1778343975.0209467, "grad/layer_0/attn": 0.0025220238603651524, "grad/layer_0/mlp": 0.0031708120368421078, "grad/layer_0/attn_mlp_ratio": 0.7953873492098147, "grad/layer_4/attn": 0.0017276729922741652, "grad/layer_4/mlp": 0.0025103120133280754, "grad/layer_4/attn_mlp_ratio": 0.6882303531506616, "grad/layer_8/attn": 0.004554435610771179, "grad/layer_8/mlp": 0.0030901243444532156, "grad/layer_8/attn_mlp_ratio": 1.4738680246183635, "grad/layer_12/attn": 0.009243114851415157, "grad/layer_12/mlp": 0.006191152147948742, "grad/layer_12/attn_mlp_ratio": 1.4929555083187616, "grad/layer_16/attn": 0.0031092967838048935, "grad/layer_16/mlp": 0.003714146325364709, "grad/layer_16/attn_mlp_ratio": 0.8371497587092511, "grad/layer_20/attn": 0.002178385853767395, "grad/layer_20/mlp": 0.004298565909266472, "grad/layer_20/attn_mlp_ratio": 0.5067703622723071, "grad/layer_24/attn": 0.003381506772711873, "grad/layer_24/mlp": 0.006102834362536669, "grad/layer_24/attn_mlp_ratio": 0.5540879067701823, "grad/layer_27/attn": 0.004283852409571409, "grad/layer_27/mlp": 0.005336544010788202, "grad/layer_27/attn_mlp_ratio": 0.8027390612046658} {"step": 16900, "timestamp": 1778343975.0356486, "train/loss": 2.274306297302246, "train/z_loss": 0.0013860572245903314, "train/perplexity": 9.721173069007486, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0029739266633987426, "optim/adamw_lr": 8.921779990196228e-05, "perf/tokens_per_sec": 1918887.2975864727, "perf/iters_per_sec": 0.9149967658932079, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.092900037765503, "data/tokens_consumed": 35443965952, "data/tokens_consumed_B": 35.443965952, "train/loss_slope": -5.416119060750986e-05} {"step": 16910, "timestamp": 1778343985.3955588, "train/loss": 2.246828389167786, "train/z_loss": 0.0013941737706772983, "train/perplexity": 9.45769209931212, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.002846165895462036, "optim/adamw_lr": 8.538497686386108e-05, "perf/tokens_per_sec": 2026310.2458289631, "perf/iters_per_sec": 0.9662200192589584, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034960961341858, "data/tokens_consumed": 35464937472, "data/tokens_consumed_B": 35.464937472, "train/loss_slope": -5.6296143272850856e-05} {"step": 16920, "timestamp": 1778343995.751212, "train/loss": 2.2558602571487425, "train/z_loss": 0.00139983898261562, "train/perplexity": 9.543499643654616, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.0027207541465759278, "optim/adamw_lr": 8.162262439727783e-05, "perf/tokens_per_sec": 2026090.131312793, "perf/iters_per_sec": 0.9661150604785885, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350733995437622, "data/tokens_consumed": 35485908992, "data/tokens_consumed_B": 35.485908992, "train/loss_slope": -5.599714786198777e-05} {"step": 16930, "timestamp": 1778344006.6703439, "train/loss": 2.2637088775634764, "train/z_loss": 0.0013892156188376249, "train/perplexity": 9.618697664123722, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0025977307558059694, "optim/adamw_lr": 7.793192267417907e-05, "perf/tokens_per_sec": 1921897.1253626079, "perf/iters_per_sec": 0.916431963616661, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0911884784698487, "data/tokens_consumed": 35506880512, "data/tokens_consumed_B": 35.506880512, "train/loss_slope": -5.524395242525661e-05} {"step": 16940, "timestamp": 1778344017.0376973, "train/loss": 2.268828535079956, "train/z_loss": 0.0014006364857777954, "train/perplexity": 9.668068374645719, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0024771368503570556, "optim/adamw_lr": 7.431410551071167e-05, "perf/tokens_per_sec": 2024544.8429426923, "perf/iters_per_sec": 0.9653782095635854, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035863447189331, "data/tokens_consumed": 35527852032, "data/tokens_consumed_B": 35.527852032, "train/loss_slope": -5.5813863003465005e-05} {"step": 16950, "timestamp": 1778344027.8730567, "grad/layer_0/attn": 0.002825099043548107, "grad/layer_0/mlp": 0.0032805567607283592, "grad/layer_0/attn_mlp_ratio": 0.8611644801428224, "grad/layer_4/attn": 0.0017977079842239618, "grad/layer_4/mlp": 0.0026238879654556513, "grad/layer_4/attn_mlp_ratio": 0.6851313544550083, "grad/layer_8/attn": 0.003052666550502181, "grad/layer_8/mlp": 0.003371791448444128, "grad/layer_8/attn_mlp_ratio": 0.9053544700623016, "grad/layer_12/attn": 0.007832146249711514, "grad/layer_12/mlp": 0.006202788557857275, "grad/layer_12/attn_mlp_ratio": 1.2626814617954585, "grad/layer_16/attn": 0.003223898820579052, "grad/layer_16/mlp": 0.003996110521256924, "grad/layer_16/attn_mlp_ratio": 0.8067591531199949, "grad/layer_20/attn": 0.0024273607414215803, "grad/layer_20/mlp": 0.00484348414465785, "grad/layer_20/attn_mlp_ratio": 0.5011600366200948, "grad/layer_24/attn": 0.0039311982691287994, "grad/layer_24/mlp": 0.006452064495533705, "grad/layer_24/attn_mlp_ratio": 0.6092930737007938, "grad/layer_27/attn": 0.0035836820024996996, "grad/layer_27/mlp": 0.00581763032823801, "grad/layer_27/attn_mlp_ratio": 0.6160037229427604} {"step": 16950, "timestamp": 1778344028.4596455, "eos/sharpness": 22.619342803955075, "eos/L0_probe": 2.2554969787597656, "eos/L_plus": 2.3577470779418945, "eos/L_minus": 2.3794403076171875, "eos/grad_norm": 0.08512407541275024, "eos/embed_grad_frac": 0.26073378324508667, "eos/time_s": 0.5832798480987549} {"step": 16950, "timestamp": 1778344028.4798129, "train/loss": 2.2764445304870606, "train/z_loss": 0.0013975391979329289, "train/perplexity": 9.741981442508273, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.002359013557434082, "optim/adamw_lr": 7.077040672302245e-05, "perf/tokens_per_sec": 1833921.9096137502, "perf/iters_per_sec": 0.8744821117466689, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.143533968925476, "data/tokens_consumed": 35548823552, "data/tokens_consumed_B": 35.548823552, "train/loss_slope": -5.53206365815949e-05} {"step": 16950, "timestamp": 1778344029.8462396, "geo/rankme_last": 430.7438659667969, "geo/layer_0/stable_rank_q_proj": 20.461400985717773, "geo/layer_0/stable_rank_k_proj": 16.590717315673828, "geo/layer_0/stable_rank_o_proj": 43.63758087158203, "geo/layer_0/stable_rank_gate_proj": 122.98759460449219, "geo/layer_0/stable_rank_down_proj": 57.99052429199219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06307957321405411, "geo/layer_0/attn_entropy_mean": 6.219995498657227, "geo/layer_0/attn_entropy_std": 0.4660643935203552, "geo/layer_7/stable_rank_q_proj": 41.58993148803711, "geo/layer_7/stable_rank_k_proj": 38.812015533447266, "geo/layer_7/stable_rank_o_proj": 86.75975036621094, "geo/layer_7/stable_rank_gate_proj": 76.84231567382812, "geo/layer_7/stable_rank_down_proj": 144.661865234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3948320150375366, "geo/layer_7/attn_entropy_mean": 4.74749755859375, "geo/layer_7/attn_entropy_std": 0.7404146194458008, "geo/layer_14/stable_rank_q_proj": 51.51899719238281, "geo/layer_14/stable_rank_k_proj": 44.55561065673828, "geo/layer_14/stable_rank_o_proj": 42.10850143432617, "geo/layer_14/stable_rank_gate_proj": 71.66792297363281, "geo/layer_14/stable_rank_down_proj": 126.46664428710938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3891564607620239, "geo/layer_14/attn_entropy_mean": 5.500212669372559, "geo/layer_14/attn_entropy_std": 0.4704016149044037, "geo/layer_21/stable_rank_q_proj": 38.22746658325195, "geo/layer_21/stable_rank_k_proj": 28.442354202270508, "geo/layer_21/stable_rank_o_proj": 64.04421997070312, "geo/layer_21/stable_rank_gate_proj": 59.134071350097656, "geo/layer_21/stable_rank_down_proj": 48.718299865722656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13929583132266998, "geo/layer_21/attn_entropy_mean": 5.851756572723389, "geo/layer_21/attn_entropy_std": 0.33297818899154663, "geo/layer_27/stable_rank_q_proj": 45.470523834228516, "geo/layer_27/stable_rank_k_proj": 30.417760848999023, "geo/layer_27/stable_rank_o_proj": 106.49237060546875, "geo/layer_27/stable_rank_gate_proj": 68.29951477050781, "geo/layer_27/stable_rank_down_proj": 129.30703735351562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09514971077442169, "geo/layer_27/attn_entropy_mean": 4.28121280670166, "geo/layer_27/attn_entropy_std": 0.7131768465042114, "attnres/final_alpha/block_0": 0.2644093632698059, "attnres/block_norm/0": 1.785487174987793, "attnres/final_alpha/block_1": 0.0036802315153181553, "attnres/block_norm/1": 50500.12109375, "attnres/final_alpha/block_2": 0.007650419604033232, "attnres/block_norm/2": 30401.70703125, "attnres/final_alpha/block_3": 0.00975878443568945, "attnres/block_norm/3": 75831.8515625, "attnres/final_alpha/block_4": 0.01112699881196022, "attnres/block_norm/4": 17758.21484375, "attnres/final_alpha/block_5": 0.6047747135162354, "attnres/block_norm/5": 7233.0712890625, "attnres/final_alpha/block_6": 0.09859945625066757, "attnres/block_norm/6": 49569.4609375, "geo/tier1_time_s": 1.36208176612854, "geo/step": 16950.0, "geo/rankme_slope": 0.00028749988276560625} {"step": 16960, "timestamp": 1778344040.2170846, "train/loss": 2.2597983360290526, "train/z_loss": 0.0013999810093082488, "train/perplexity": 9.581156797800213, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0022433972358703613, "optim/adamw_lr": 6.730191707611084e-05, "perf/tokens_per_sec": 1787393.5390324641, "perf/iters_per_sec": 0.8522956557428666, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1733017683029174, "data/tokens_consumed": 35569795072, "data/tokens_consumed_B": 35.569795072, "train/loss_slope": -5.462383010742944e-05} {"step": 16970, "timestamp": 1778344050.5827222, "train/loss": 2.2926640033721926, "train/z_loss": 0.001393734966404736, "train/perplexity": 9.901279620384594, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.002130326628684998, "optim/adamw_lr": 6.390979886054992e-05, "perf/tokens_per_sec": 2024378.7355780364, "perf/iters_per_sec": 0.9652990033998663, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0359484434127808, "data/tokens_consumed": 35590766592, "data/tokens_consumed_B": 35.590766592, "train/loss_slope": -5.4217657998557475e-05} {"step": 16980, "timestamp": 1778344060.938094, "train/loss": 2.291506552696228, "train/z_loss": 0.0013928417116403579, "train/perplexity": 9.88982600736982, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.0020198410749435423, "optim/adamw_lr": 6.059523224830627e-05, "perf/tokens_per_sec": 2026517.0551535112, "perf/iters_per_sec": 0.9663186336295658, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034855341911316, "data/tokens_consumed": 35611738112, "data/tokens_consumed_B": 35.611738112, "train/loss_slope": -5.264930491900971e-05} {"step": 16990, "timestamp": 1778344071.3017626, "train/loss": 2.2556292057037353, "train/z_loss": 0.0013947836123406887, "train/perplexity": 9.541294858990673, "train/grad_norm": 0.06640625, "optim/muon_lr": 0.001911972761154175, "optim/adamw_lr": 5.735918283462524e-05, "perf/tokens_per_sec": 2025075.45080774, "perf/iters_per_sec": 0.965631223110075, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355920314788818, "data/tokens_consumed": 35632709632, "data/tokens_consumed_B": 35.632709632, "train/loss_slope": -5.144603231666404e-05} {"step": 17000, "timestamp": 1778344081.6477456, "grad/layer_0/attn": 0.0024891761131584644, "grad/layer_0/mlp": 0.0026672903914004564, "grad/layer_0/attn_mlp_ratio": 0.9332227296515898, "grad/layer_4/attn": 0.0022443661000579596, "grad/layer_4/mlp": 0.0024165690410882235, "grad/layer_4/attn_mlp_ratio": 0.9287406935302013, "grad/layer_8/attn": 0.002952697454020381, "grad/layer_8/mlp": 0.0030536181293427944, "grad/layer_8/attn_mlp_ratio": 0.9669504280683661, "grad/layer_12/attn": 0.005541968625038862, "grad/layer_12/mlp": 0.0059925890527665615, "grad/layer_12/attn_mlp_ratio": 0.924803700664234, "grad/layer_16/attn": 0.0030014822259545326, "grad/layer_16/mlp": 0.0037632121238857508, "grad/layer_16/attn_mlp_ratio": 0.7975851605986004, "grad/layer_20/attn": 0.0028270671609789133, "grad/layer_20/mlp": 0.004104310646653175, "grad/layer_20/attn_mlp_ratio": 0.6888043658205513, "grad/layer_24/attn": 0.0039043703582137823, "grad/layer_24/mlp": 0.0058229900896549225, "grad/layer_24/attn_mlp_ratio": 0.6705095202032549, "grad/layer_27/attn": 0.004052889067679644, "grad/layer_27/mlp": 0.005256003234535456, "grad/layer_27/attn_mlp_ratio": 0.7710971264134201} {"step": 17000, "timestamp": 1778344081.6620042, "train/loss": 2.241297221183777, "train/z_loss": 0.0014037114335224033, "train/perplexity": 9.405524422661873, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0018067598342895509, "optim/adamw_lr": 5.420279502868652e-05, "perf/tokens_per_sec": 2025171.2639267328, "perf/iters_per_sec": 0.9656769103654541, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355430364608764, "data/tokens_consumed": 35653681152, "data/tokens_consumed_B": 35.653681152, "train/loss_slope": -5.0476675794677606e-05} {"step": 17000, "timestamp": 1778344088.823716, "geo/ww_alpha_mean": 7.571896228633995, "geo/ww_alpha_std": 4.258413769128912, "geo/ww_alpha_min": 1.370971539209714, "geo/ww_alpha_max": 24.64281656082464, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.879362493505146, "geo/ww_alpha_by_type/k_proj": 4.618689644657722, "geo/ww_alpha_by_type/v_proj": 8.532134802156815, "geo/ww_alpha_by_type/o_proj": 8.319383448733666, "geo/ww_alpha_by_type/gate_proj": 7.581992087896581, "geo/ww_alpha_by_type/up_proj": 12.28607455730644, "geo/ww_alpha_by_type/down_proj": 7.880286930984268, "geo/twonn_id/layer_0": 0.7292966246604919, "geo/twonn_id/layer_7": 3.2561323642730713, "geo/twonn_id/layer_14": 4.836643695831299, "geo/twonn_id/layer_21": 8.314148902893066, "geo/twonn_id/layer_27": 6.818349838256836, "geo/tier2_time_s": 7.154027938842773} {"step": 17000, "timestamp": 1778344089.6495364, "eoc/jacobian_sigma/layer_0/attn": 1471.7427978515625, "eoc/jacobian_sigma/layer_0/mlp": 9822.359375, "eoc/jacobian_sigma/layer_0": 9822.359375, "eoc/jacobian_sigma/layer_7/attn": 1.1273094415664673, "eoc/jacobian_sigma/layer_7/mlp": 1.8270543813705444, "eoc/jacobian_sigma/layer_7": 1.8270543813705444, "eoc/jacobian_sigma/layer_14/attn": 2.2127249240875244, "eoc/jacobian_sigma/layer_14/mlp": 16.699451446533203, "eoc/jacobian_sigma/layer_14": 16.699451446533203, "eoc/jacobian_sigma/layer_21/attn": 1.0882471799850464, "eoc/jacobian_sigma/layer_21/mlp": 5.6415205001831055, "eoc/jacobian_sigma/layer_21": 5.6415205001831055, "eoc/jacobian_sigma/layer_27/attn": 3.9706125259399414, "eoc/jacobian_sigma/layer_27/mlp": 53.12160873413086, "eoc/jacobian_sigma/layer_27": 53.12160873413086, "eoc/layer0_sigma": 9822.359375, "eoc/sigma_max": 53.12160873413086, "eoc/sigma_min": 1.8270543813705444, "eoc/sigma_mean": 19.322408765554428, "eoc/time_s": 0.8195061683654785} {"step": 17010, "timestamp": 1778344100.0218163, "train/loss": 2.2413862228393553, "train/z_loss": 0.0013941113254986703, "train/perplexity": 9.406361567160143, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.001704237461090088, "optim/adamw_lr": 5.112712383270263e-05, "perf/tokens_per_sec": 1142659.8491551662, "perf/iters_per_sec": 0.5448626752639609, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.8353248357772827, "data/tokens_consumed": 35674652672, "data/tokens_consumed_B": 35.674652672, "train/loss_slope": -5.301213553457557e-05} {"step": 17020, "timestamp": 1778344110.3722842, "train/loss": 2.241895914077759, "train/z_loss": 0.0013872791547328233, "train/perplexity": 9.411157129260355, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0016044366359710693, "optim/adamw_lr": 4.813309907913207e-05, "perf/tokens_per_sec": 2027403.7790784792, "perf/iters_per_sec": 0.9667414565460583, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344027280807495, "data/tokens_consumed": 35695624192, "data/tokens_consumed_B": 35.695624192, "train/loss_slope": -5.5164750016490384e-05} {"step": 17025, "timestamp": 1778344116.157374, "eos/sharpness": 22.895812988281246, "eos/L0_probe": 2.254692792892456, "eos/L_plus": 2.3616721630096436, "eos/L_minus": 2.376671552658081, "eos/grad_norm": 0.07669707387685776, "eos/embed_grad_frac": 0.2985984981060028, "eos/time_s": 0.6031489372253418} {"step": 17025, "timestamp": 1778344117.5371919, "geo/rankme_last": 430.8769836425781, "geo/layer_0/stable_rank_q_proj": 20.460222244262695, "geo/layer_0/stable_rank_k_proj": 16.590604782104492, "geo/layer_0/stable_rank_o_proj": 43.63642883300781, "geo/layer_0/stable_rank_gate_proj": 122.98550415039062, "geo/layer_0/stable_rank_down_proj": 57.99341583251953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06278891116380692, "geo/layer_0/attn_entropy_mean": 6.220607757568359, "geo/layer_0/attn_entropy_std": 0.4655885398387909, "geo/layer_7/stable_rank_q_proj": 41.592098236083984, "geo/layer_7/stable_rank_k_proj": 38.81126403808594, "geo/layer_7/stable_rank_o_proj": 86.76726531982422, "geo/layer_7/stable_rank_gate_proj": 76.83638000488281, "geo/layer_7/stable_rank_down_proj": 144.6642303466797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40014126896858215, "geo/layer_7/attn_entropy_mean": 4.742395401000977, "geo/layer_7/attn_entropy_std": 0.7407829761505127, "geo/layer_14/stable_rank_q_proj": 51.51839828491211, "geo/layer_14/stable_rank_k_proj": 44.547969818115234, "geo/layer_14/stable_rank_o_proj": 42.11027526855469, "geo/layer_14/stable_rank_gate_proj": 71.66449737548828, "geo/layer_14/stable_rank_down_proj": 126.46641540527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3733397424221039, "geo/layer_14/attn_entropy_mean": 5.4921464920043945, "geo/layer_14/attn_entropy_std": 0.47027626633644104, "geo/layer_21/stable_rank_q_proj": 38.22895812988281, "geo/layer_21/stable_rank_k_proj": 28.44147300720215, "geo/layer_21/stable_rank_o_proj": 64.04423522949219, "geo/layer_21/stable_rank_gate_proj": 59.13029479980469, "geo/layer_21/stable_rank_down_proj": 48.7176399230957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14292296767234802, "geo/layer_21/attn_entropy_mean": 5.847118854522705, "geo/layer_21/attn_entropy_std": 0.3332630693912506, "geo/layer_27/stable_rank_q_proj": 45.47022247314453, "geo/layer_27/stable_rank_k_proj": 30.41877555847168, "geo/layer_27/stable_rank_o_proj": 106.48858642578125, "geo/layer_27/stable_rank_gate_proj": 68.30036163330078, "geo/layer_27/stable_rank_down_proj": 129.31344604492188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.089671790599823, "geo/layer_27/attn_entropy_mean": 4.280117988586426, "geo/layer_27/attn_entropy_std": 0.7143875956535339, "attnres/final_alpha/block_0": 0.2644846439361572, "attnres/block_norm/0": 1.785402774810791, "attnres/final_alpha/block_1": 0.0036872420459985733, "attnres/block_norm/1": 50550.640625, "attnres/final_alpha/block_2": 0.0076419757679104805, "attnres/block_norm/2": 30390.7265625, "attnres/final_alpha/block_3": 0.009749757125973701, "attnres/block_norm/3": 75803.890625, "attnres/final_alpha/block_4": 0.01113863941282034, "attnres/block_norm/4": 17731.40234375, "attnres/final_alpha/block_5": 0.6046291589736938, "attnres/block_norm/5": 7225.72998046875, "attnres/final_alpha/block_6": 0.09866863489151001, "attnres/block_norm/6": 49511.84765625, "geo/tier1_time_s": 1.3618638515472412, "geo/step": 17025.0, "geo/rankme_slope": 0.0003371362607543017} {"step": 17030, "timestamp": 1778344122.7128828, "train/loss": 2.2592461347579955, "train/z_loss": 0.0014095247723162174, "train/perplexity": 9.575867531342505, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0015073913335800172, "optim/adamw_lr": 4.522174000740051e-05, "perf/tokens_per_sec": 1700195.40292359, "perf/iters_per_sec": 0.8107163443201018, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2334770441055298, "data/tokens_consumed": 35716595712, "data/tokens_consumed_B": 35.716595712, "train/loss_slope": -5.559130800832627e-05} {"step": 17040, "timestamp": 1778344133.0766764, "train/loss": 2.3051613569259644, "train/z_loss": 0.001384204300120473, "train/perplexity": 10.025795853515142, "train/grad_norm": 0.078125, "optim/muon_lr": 0.001413135528564453, "optim/adamw_lr": 4.239406585693359e-05, "perf/tokens_per_sec": 2025231.6005379285, "perf/iters_per_sec": 0.9657056811036723, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355121850967408, "data/tokens_consumed": 35737567232, "data/tokens_consumed_B": 35.737567232, "train/loss_slope": -5.3491732382466874e-05} {"step": 17050, "timestamp": 1778344143.4230175, "grad/layer_0/attn": 0.0027010953053832054, "grad/layer_0/mlp": 0.0030524348840117455, "grad/layer_0/attn_mlp_ratio": 0.8848985546067938, "grad/layer_4/attn": 0.0017312150448560715, "grad/layer_4/mlp": 0.002551211742684245, "grad/layer_4/attn_mlp_ratio": 0.6785853749544303, "grad/layer_8/attn": 0.003470394993200898, "grad/layer_8/mlp": 0.003403808455914259, "grad/layer_8/attn_mlp_ratio": 1.0195623332490138, "grad/layer_12/attn": 0.006732082460075617, "grad/layer_12/mlp": 0.006480386946350336, "grad/layer_12/attn_mlp_ratio": 1.0388395649712052, "grad/layer_16/attn": 0.0027744309045374393, "grad/layer_16/mlp": 0.004187153652310371, "grad/layer_16/attn_mlp_ratio": 0.6626054519747633, "grad/layer_20/attn": 0.0035236631520092487, "grad/layer_20/mlp": 0.004368176683783531, "grad/layer_20/attn_mlp_ratio": 0.8066667917586431, "grad/layer_24/attn": 0.0038251192308962345, "grad/layer_24/mlp": 0.005865830462425947, "grad/layer_24/attn_mlp_ratio": 0.6521018959187717, "grad/layer_27/attn": 0.0061493972316384315, "grad/layer_27/mlp": 0.005574832670390606, "grad/layer_27/attn_mlp_ratio": 1.1030639814524101} {"step": 17050, "timestamp": 1778344143.4354584, "train/loss": 2.27279155254364, "train/z_loss": 0.0014003454009070993, "train/perplexity": 9.70645911980579, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.001321696639060974, "optim/adamw_lr": 3.965089917182922e-05, "perf/tokens_per_sec": 2025381.198616483, "perf/iters_per_sec": 0.9657770150263228, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0354357004165649, "data/tokens_consumed": 35758538752, "data/tokens_consumed_B": 35.758538752, "train/loss_slope": -5.1861962033148006e-05} {"step": 17060, "timestamp": 1778344153.7913425, "train/loss": 2.2716536998748778, "train/z_loss": 0.0013969637104310096, "train/perplexity": 9.69542088052804, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.001233106255531311, "optim/adamw_lr": 3.699318766593933e-05, "perf/tokens_per_sec": 2026045.656812594, "perf/iters_per_sec": 0.9660938533843012, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350961208343505, "data/tokens_consumed": 35779510272, "data/tokens_consumed_B": 35.779510272, "train/loss_slope": -4.984238105531298e-05} {"step": 17070, "timestamp": 1778344164.1437314, "train/loss": 2.319962573051453, "train/z_loss": 0.001391262060496956, "train/perplexity": 10.175293468761483, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0011473941802978517, "optim/adamw_lr": 3.4421825408935545e-05, "perf/tokens_per_sec": 2027071.3535416105, "perf/iters_per_sec": 0.9665829436977437, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0345723628997803, "data/tokens_consumed": 35800481792, "data/tokens_consumed_B": 35.800481792, "train/loss_slope": -4.551585415194361e-05} {"step": 17080, "timestamp": 1778344174.5052829, "train/loss": 2.28857159614563, "train/z_loss": 0.0013830315787345172, "train/perplexity": 9.860842351435997, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0010645872354507447, "optim/adamw_lr": 3.193761706352234e-05, "perf/tokens_per_sec": 2024776.1806265404, "perf/iters_per_sec": 0.9654885199673369, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035745096206665, "data/tokens_consumed": 35821453312, "data/tokens_consumed_B": 35.821453312, "train/loss_slope": -4.3178624545995045e-05} {"step": 17090, "timestamp": 1778344184.8614345, "train/loss": 2.274631643295288, "train/z_loss": 0.001398240972775966, "train/perplexity": 9.724336328262114, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0009847134351730347, "optim/adamw_lr": 2.9541403055191037e-05, "perf/tokens_per_sec": 2026214.5116077643, "perf/iters_per_sec": 0.9661743696249792, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0350098609924316, "data/tokens_consumed": 35842424832, "data/tokens_consumed_B": 35.842424832, "train/loss_slope": -4.2215962271199756e-05} {"step": 17100, "timestamp": 1778344195.2033024, "grad/layer_0/attn": 0.002933525014668703, "grad/layer_0/mlp": 0.003154036356136203, "grad/layer_0/attn_mlp_ratio": 0.9300859566672119, "grad/layer_4/attn": 0.0016856902511790395, "grad/layer_4/mlp": 0.0025247163139283657, "grad/layer_4/attn_mlp_ratio": 0.6676750869441878, "grad/layer_8/attn": 0.004502283409237862, "grad/layer_8/mlp": 0.003443382680416107, "grad/layer_8/attn_mlp_ratio": 1.307517547814941, "grad/layer_12/attn": 0.006967822555452585, "grad/layer_12/mlp": 0.005677086766809225, "grad/layer_12/attn_mlp_ratio": 1.2273588054798976, "grad/layer_16/attn": 0.002745281672105193, "grad/layer_16/mlp": 0.0036594257690012455, "grad/layer_16/attn_mlp_ratio": 0.7501946399188716, "grad/layer_20/attn": 0.00195450522005558, "grad/layer_20/mlp": 0.004417255520820618, "grad/layer_20/attn_mlp_ratio": 0.44247048118362714, "grad/layer_24/attn": 0.005308966618031263, "grad/layer_24/mlp": 0.006681361235678196, "grad/layer_24/attn_mlp_ratio": 0.7945935493237881, "grad/layer_27/attn": 0.0052389102056622505, "grad/layer_27/mlp": 0.005793816410005093, "grad/layer_27/attn_mlp_ratio": 0.9042243910581914} {"step": 17100, "timestamp": 1778344195.789735, "eos/sharpness": 26.957869529724118, "eos/L0_probe": 2.255262613296509, "eos/L_plus": 2.4017515182495117, "eos/L_minus": 2.378352403640747, "eos/grad_norm": 0.07961742579936981, "eos/embed_grad_frac": 0.2749525010585785, "eos/time_s": 0.5836429595947266} {"step": 17100, "timestamp": 1778344195.8074121, "train/loss": 2.305117154121399, "train/z_loss": 0.0013902694219723344, "train/perplexity": 10.025352695014911, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.0009077996015548706, "optim/adamw_lr": 2.7233988046646116e-05, "perf/tokens_per_sec": 1917037.0710005835, "perf/iters_per_sec": 0.9141145091059606, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.093954849243164, "data/tokens_consumed": 35863396352, "data/tokens_consumed_B": 35.863396352, "train/loss_slope": -3.729830502104634e-05} {"step": 17100, "timestamp": 1778344197.1723564, "geo/rankme_last": 430.8981628417969, "geo/layer_0/stable_rank_q_proj": 20.460084915161133, "geo/layer_0/stable_rank_k_proj": 16.591201782226562, "geo/layer_0/stable_rank_o_proj": 43.637367248535156, "geo/layer_0/stable_rank_gate_proj": 122.99273681640625, "geo/layer_0/stable_rank_down_proj": 57.99523162841797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06312162429094315, "geo/layer_0/attn_entropy_mean": 6.221146583557129, "geo/layer_0/attn_entropy_std": 0.46581020951271057, "geo/layer_7/stable_rank_q_proj": 41.59414291381836, "geo/layer_7/stable_rank_k_proj": 38.8099365234375, "geo/layer_7/stable_rank_o_proj": 86.76728057861328, "geo/layer_7/stable_rank_gate_proj": 76.8341064453125, "geo/layer_7/stable_rank_down_proj": 144.67527770996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3960847556591034, "geo/layer_7/attn_entropy_mean": 4.741410255432129, "geo/layer_7/attn_entropy_std": 0.7406139969825745, "geo/layer_14/stable_rank_q_proj": 51.51541519165039, "geo/layer_14/stable_rank_k_proj": 44.54466247558594, "geo/layer_14/stable_rank_o_proj": 42.110713958740234, "geo/layer_14/stable_rank_gate_proj": 71.66625213623047, "geo/layer_14/stable_rank_down_proj": 126.46992492675781, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38297247886657715, "geo/layer_14/attn_entropy_mean": 5.49864387512207, "geo/layer_14/attn_entropy_std": 0.46945855021476746, "geo/layer_21/stable_rank_q_proj": 38.23131561279297, "geo/layer_21/stable_rank_k_proj": 28.443004608154297, "geo/layer_21/stable_rank_o_proj": 64.04422760009766, "geo/layer_21/stable_rank_gate_proj": 59.131614685058594, "geo/layer_21/stable_rank_down_proj": 48.720069885253906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1416490375995636, "geo/layer_21/attn_entropy_mean": 5.847084045410156, "geo/layer_21/attn_entropy_std": 0.3324848413467407, "geo/layer_27/stable_rank_q_proj": 45.46889114379883, "geo/layer_27/stable_rank_k_proj": 30.418672561645508, "geo/layer_27/stable_rank_o_proj": 106.4867172241211, "geo/layer_27/stable_rank_gate_proj": 68.29486846923828, "geo/layer_27/stable_rank_down_proj": 129.31130981445312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08580908924341202, "geo/layer_27/attn_entropy_mean": 4.279472351074219, "geo/layer_27/attn_entropy_std": 0.7146297693252563, "attnres/final_alpha/block_0": 0.26434072852134705, "attnres/block_norm/0": 1.7853422164916992, "attnres/final_alpha/block_1": 0.003674842184409499, "attnres/block_norm/1": 50511.46875, "attnres/final_alpha/block_2": 0.007649019360542297, "attnres/block_norm/2": 30391.8125, "attnres/final_alpha/block_3": 0.00976509042084217, "attnres/block_norm/3": 75753.1171875, "attnres/final_alpha/block_4": 0.011154872365295887, "attnres/block_norm/4": 17731.4609375, "attnres/final_alpha/block_5": 0.6048893928527832, "attnres/block_norm/5": 7226.9609375, "attnres/final_alpha/block_6": 0.09852607548236847, "attnres/block_norm/6": 49513.71875, "geo/tier1_time_s": 1.3607192039489746, "geo/step": 17100.0, "geo/rankme_slope": 0.00037316750919117646} {"step": 17110, "timestamp": 1778344207.5330415, "train/loss": 2.2625092267990112, "train/z_loss": 0.0013917512726038695, "train/perplexity": 9.607165504782735, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0008338695764541626, "optim/adamw_lr": 2.5016087293624876e-05, "perf/tokens_per_sec": 1789130.1409199676, "perf/iters_per_sec": 0.8531237320518339, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.172162914276123, "data/tokens_consumed": 35884367872, "data/tokens_consumed_B": 35.884367872, "train/loss_slope": -3.670653249635888e-05} {"step": 17120, "timestamp": 1778344217.8968463, "train/loss": 2.2540221214294434, "train/z_loss": 0.0013916628086008132, "train/perplexity": 9.525973508714515, "train/grad_norm": 0.06640625, "optim/muon_lr": 0.0007629477977752685, "optim/adamw_lr": 2.2888433933258055e-05, "perf/tokens_per_sec": 2024927.6226576692, "perf/iters_per_sec": 0.9655607331550928, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.035667634010315, "data/tokens_consumed": 35905339392, "data/tokens_consumed_B": 35.905339392, "train/loss_slope": -3.716307207159631e-05} {"step": 17130, "timestamp": 1778344228.2571816, "train/loss": 2.292231726646423, "train/z_loss": 0.0013911548303440213, "train/perplexity": 9.897000452608319, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.0006950587034225464, "optim/adamw_lr": 2.085176110267639e-05, "perf/tokens_per_sec": 2025487.5348767142, "perf/iters_per_sec": 0.9658277201064654, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353813409805297, "data/tokens_consumed": 35926310912, "data/tokens_consumed_B": 35.926310912, "train/loss_slope": -3.678840977130562e-05} {"step": 17140, "timestamp": 1778344238.6086748, "train/loss": 2.3037189722061155, "train/z_loss": 0.0013926775893196464, "train/perplexity": 10.011345222961406, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0006302231550216674, "optim/adamw_lr": 1.8906694650650022e-05, "perf/tokens_per_sec": 2026964.1969937645, "perf/iters_per_sec": 0.9665318474739859, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346270561218263, "data/tokens_consumed": 35947282432, "data/tokens_consumed_B": 35.947282432, "train/loss_slope": -3.594018349302748e-05} {"step": 17150, "timestamp": 1778344248.9519038, "grad/layer_0/attn": 0.00240392261184752, "grad/layer_0/mlp": 0.0029274921398609877, "grad/layer_0/attn_mlp_ratio": 0.8211542217313171, "grad/layer_4/attn": 0.001679440145380795, "grad/layer_4/mlp": 0.002408913103863597, "grad/layer_4/attn_mlp_ratio": 0.6971775249881075, "grad/layer_8/attn": 0.003950400277972221, "grad/layer_8/mlp": 0.0031539497431367636, "grad/layer_8/attn_mlp_ratio": 1.2525247624240425, "grad/layer_12/attn": 0.0051247612573206425, "grad/layer_12/mlp": 0.005592781584709883, "grad/layer_12/attn_mlp_ratio": 0.9163170576336348, "grad/layer_16/attn": 0.0035748693626374006, "grad/layer_16/mlp": 0.003906996920704842, "grad/layer_16/attn_mlp_ratio": 0.9149915763161951, "grad/layer_20/attn": 0.003135990584269166, "grad/layer_20/mlp": 0.004410835448652506, "grad/layer_20/attn_mlp_ratio": 0.7109742699945365, "grad/layer_24/attn": 0.00419076532125473, "grad/layer_24/mlp": 0.0060346839018166065, "grad/layer_24/attn_mlp_ratio": 0.6944465227994033, "grad/layer_27/attn": 0.004246549680829048, "grad/layer_27/mlp": 0.005356426350772381, "grad/layer_27/attn_mlp_ratio": 0.7927952936265388} {"step": 17150, "timestamp": 1778344248.9660351, "train/loss": 2.268400764465332, "train/z_loss": 0.0013935445807874204, "train/perplexity": 9.663933543537546, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0005684632062911987, "optim/adamw_lr": 1.705389618873596e-05, "perf/tokens_per_sec": 2026028.4368851483, "perf/iters_per_sec": 0.9660856422830335, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0351049184799195, "data/tokens_consumed": 35968253952, "data/tokens_consumed_B": 35.968253952, "train/loss_slope": -3.385425076054067e-05} {"step": 17160, "timestamp": 1778344259.3200743, "train/loss": 2.318076825141907, "train/z_loss": 0.0013894399744458497, "train/perplexity": 10.156123510908492, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.0005097991228103638, "optim/adamw_lr": 1.529397368431091e-05, "perf/tokens_per_sec": 2026770.8860915885, "perf/iters_per_sec": 0.9664396696527426, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0347257375717163, "data/tokens_consumed": 35989225472, "data/tokens_consumed_B": 35.989225472, "train/loss_slope": -2.8371791642169427e-05} {"step": 17170, "timestamp": 1778344269.6706755, "train/loss": 2.299088644981384, "train/z_loss": 0.0013880022685043513, "train/perplexity": 9.965096574438144, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0004542505741119385, "optim/adamw_lr": 1.3627517223358153e-05, "perf/tokens_per_sec": 2026944.4391987275, "perf/iters_per_sec": 0.9665224262231481, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0346371412277222, "data/tokens_consumed": 36010196992, "data/tokens_consumed_B": 36.010196992, "train/loss_slope": -2.6176569063385367e-05} {"step": 17175, "timestamp": 1778344275.4287248, "eos/sharpness": 5.976843833923339, "eos/L0_probe": 2.2544288635253906, "eos/L_plus": 2.2833669185638428, "eos/L_minus": 2.285259246826172, "eos/grad_norm": 0.07328469306230545, "eos/embed_grad_frac": 0.3312930464744568, "eos/time_s": 0.5821499824523926} {"step": 17175, "timestamp": 1778344276.8086166, "geo/rankme_last": 430.98138427734375, "geo/layer_0/stable_rank_q_proj": 20.460344314575195, "geo/layer_0/stable_rank_k_proj": 16.59040069580078, "geo/layer_0/stable_rank_o_proj": 43.63901901245117, "geo/layer_0/stable_rank_gate_proj": 122.99651336669922, "geo/layer_0/stable_rank_down_proj": 57.993404388427734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06335440278053284, "geo/layer_0/attn_entropy_mean": 6.221292018890381, "geo/layer_0/attn_entropy_std": 0.4656516909599304, "geo/layer_7/stable_rank_q_proj": 41.5881462097168, "geo/layer_7/stable_rank_k_proj": 38.81007385253906, "geo/layer_7/stable_rank_o_proj": 86.76045227050781, "geo/layer_7/stable_rank_gate_proj": 76.83500671386719, "geo/layer_7/stable_rank_down_proj": 144.6715087890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.399637371301651, "geo/layer_7/attn_entropy_mean": 4.744816780090332, "geo/layer_7/attn_entropy_std": 0.7404052019119263, "geo/layer_14/stable_rank_q_proj": 51.5165901184082, "geo/layer_14/stable_rank_k_proj": 44.54576873779297, "geo/layer_14/stable_rank_o_proj": 42.10809326171875, "geo/layer_14/stable_rank_gate_proj": 71.6678237915039, "geo/layer_14/stable_rank_down_proj": 126.47698211669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37972164154052734, "geo/layer_14/attn_entropy_mean": 5.498091220855713, "geo/layer_14/attn_entropy_std": 0.4698473811149597, "geo/layer_21/stable_rank_q_proj": 38.22940444946289, "geo/layer_21/stable_rank_k_proj": 28.44312286376953, "geo/layer_21/stable_rank_o_proj": 64.04425811767578, "geo/layer_21/stable_rank_gate_proj": 59.12997055053711, "geo/layer_21/stable_rank_down_proj": 48.71902084350586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13549141585826874, "geo/layer_21/attn_entropy_mean": 5.846807956695557, "geo/layer_21/attn_entropy_std": 0.3325124979019165, "geo/layer_27/stable_rank_q_proj": 45.46984100341797, "geo/layer_27/stable_rank_k_proj": 30.41795539855957, "geo/layer_27/stable_rank_o_proj": 106.4853515625, "geo/layer_27/stable_rank_gate_proj": 68.29074096679688, "geo/layer_27/stable_rank_down_proj": 129.311279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09345172345638275, "geo/layer_27/attn_entropy_mean": 4.278928756713867, "geo/layer_27/attn_entropy_std": 0.7119132876396179, "attnres/final_alpha/block_0": 0.2642505168914795, "attnres/block_norm/0": 1.785294532775879, "attnres/final_alpha/block_1": 0.0036777909845113754, "attnres/block_norm/1": 50537.8515625, "attnres/final_alpha/block_2": 0.00764557346701622, "attnres/block_norm/2": 30406.24609375, "attnres/final_alpha/block_3": 0.009759756736457348, "attnres/block_norm/3": 75626.5703125, "attnres/final_alpha/block_4": 0.011152363382279873, "attnres/block_norm/4": 17757.734375, "attnres/final_alpha/block_5": 0.6050028800964355, "attnres/block_norm/5": 7228.78271484375, "attnres/final_alpha/block_6": 0.09851113706827164, "attnres/block_norm/6": 49466.265625, "geo/tier1_time_s": 1.3600943088531494, "geo/step": 17175.0, "geo/rankme_slope": 0.00036472120098039214} {"step": 17180, "timestamp": 1778344281.9952004, "train/loss": 2.317234253883362, "train/z_loss": 0.0013921995065174996, "train/perplexity": 10.1475698571774, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0004018348455429077, "optim/adamw_lr": 1.205504536628723e-05, "perf/tokens_per_sec": 1702476.16349699, "perf/iters_per_sec": 0.8118038957104635, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2318245887756347, "data/tokens_consumed": 36031168512, "data/tokens_consumed_B": 36.031168512, "train/loss_slope": -2.4322261275238077e-05} {"step": 17190, "timestamp": 1778344292.3488448, "train/loss": 2.2736140966415403, "train/z_loss": 0.0013948031584732235, "train/perplexity": 9.71444639495898, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0003525698184967041, "optim/adamw_lr": 1.0577094554901122e-05, "perf/tokens_per_sec": 2026443.7567262028, "perf/iters_per_sec": 0.9662836822157873, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.034892773628235, "data/tokens_consumed": 36052140032, "data/tokens_consumed_B": 36.052140032, "train/loss_slope": -2.52051760189677e-05} {"step": 17200, "timestamp": 1778344302.6950393, "grad/layer_0/attn": 0.0026127928867936134, "grad/layer_0/mlp": 0.0029515395872294903, "grad/layer_0/attn_mlp_ratio": 0.8852304775363373, "grad/layer_4/attn": 0.0017625949112698436, "grad/layer_4/mlp": 0.00240507279522717, "grad/layer_4/attn_mlp_ratio": 0.7328654839392543, "grad/layer_8/attn": 0.0038415351882576942, "grad/layer_8/mlp": 0.003110515885055065, "grad/layer_8/attn_mlp_ratio": 1.2350154143926342, "grad/layer_12/attn": 0.003486773231998086, "grad/layer_12/mlp": 0.005700024776160717, "grad/layer_12/attn_mlp_ratio": 0.6117119324480249, "grad/layer_16/attn": 0.00278339977376163, "grad/layer_16/mlp": 0.0038536081556230783, "grad/layer_16/attn_mlp_ratio": 0.722284048904081, "grad/layer_20/attn": 0.002428550273180008, "grad/layer_20/mlp": 0.004324031062424183, "grad/layer_20/attn_mlp_ratio": 0.5616403263427198, "grad/layer_24/attn": 0.004417550750076771, "grad/layer_24/mlp": 0.005918697454035282, "grad/layer_24/attn_mlp_ratio": 0.746372104630511, "grad/layer_27/attn": 0.0035359093453735113, "grad/layer_27/mlp": 0.005632029380649328, "grad/layer_27/attn_mlp_ratio": 0.6278215264181908} {"step": 17200, "timestamp": 1778344302.7090375, "train/loss": 2.2652502775192263, "train/z_loss": 0.0014007641235366463, "train/perplexity": 9.633535356749196, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.000306471586227417, "optim/adamw_lr": 9.194147586822509e-06, "perf/tokens_per_sec": 2025254.3559276895, "perf/iters_per_sec": 0.9657165317190597, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0355005502700805, "data/tokens_consumed": 36073111552, "data/tokens_consumed_B": 36.073111552, "train/loss_slope": -2.4314402518170943e-05} {"step": 17210, "timestamp": 1778344313.0648043, "train/loss": 2.291729974746704, "train/z_loss": 0.0013976038433611393, "train/perplexity": 9.892035859430884, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.00026355504989624025, "optim/adamw_lr": 7.906651496887207e-06, "perf/tokens_per_sec": 2026606.7012023276, "perf/iters_per_sec": 0.9663613801967276, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348095655441285, "data/tokens_consumed": 36094083072, "data/tokens_consumed_B": 36.094083072, "train/loss_slope": -2.1473948074968144e-05} {"step": 17220, "timestamp": 1778344323.4230225, "train/loss": 2.304183840751648, "train/z_loss": 0.0013915877090767026, "train/perplexity": 10.016000264361358, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.00022383391857147216, "optim/adamw_lr": 6.715017557144164e-06, "perf/tokens_per_sec": 2025521.3502817047, "perf/iters_per_sec": 0.9658438445480846, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353640556335448, "data/tokens_consumed": 36115054592, "data/tokens_consumed_B": 36.115054592, "train/loss_slope": -2.2044048200596554e-05} {"step": 17230, "timestamp": 1778344333.7807555, "train/loss": 2.246718955039978, "train/z_loss": 0.0014071539626456797, "train/perplexity": 9.456657161655942, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.0001873224973678589, "optim/adamw_lr": 5.619674921035766e-06, "perf/tokens_per_sec": 2025763.5015693398, "perf/iters_per_sec": 0.9659593112799357, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0352402925491333, "data/tokens_consumed": 36136026112, "data/tokens_consumed_B": 36.136026112, "train/loss_slope": -2.244202991237245e-05} {"step": 17240, "timestamp": 1778344344.141123, "train/loss": 2.24309823513031, "train/z_loss": 0.0013917136588133872, "train/perplexity": 9.422479166603756, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.00015403151512145998, "optim/adamw_lr": 4.620945453643798e-06, "perf/tokens_per_sec": 2025600.599357733, "perf/iters_per_sec": 0.9658816334522882, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353235483169556, "data/tokens_consumed": 36156997632, "data/tokens_consumed_B": 36.156997632, "train/loss_slope": -2.2618360158884334e-05} {"step": 17250, "timestamp": 1778344354.4953668, "grad/layer_0/attn": 0.0024421175476163626, "grad/layer_0/mlp": 0.002802823670208454, "grad/layer_0/attn_mlp_ratio": 0.8713061354673524, "grad/layer_4/attn": 0.0019389154622331262, "grad/layer_4/mlp": 0.0024164007045328617, "grad/layer_4/attn_mlp_ratio": 0.8023981197969999, "grad/layer_8/attn": 0.003349142847582698, "grad/layer_8/mlp": 0.00317017431370914, "grad/layer_8/attn_mlp_ratio": 1.0564538131087124, "grad/layer_12/attn": 0.007803896442055702, "grad/layer_12/mlp": 0.00603973213583231, "grad/layer_12/attn_mlp_ratio": 1.2920931156114872, "grad/layer_16/attn": 0.0029478801880031824, "grad/layer_16/mlp": 0.0036642353516072035, "grad/layer_16/attn_mlp_ratio": 0.8045007551876029, "grad/layer_20/attn": 0.003080847440287471, "grad/layer_20/mlp": 0.0040289959870278835, "grad/layer_20/attn_mlp_ratio": 0.7646687595966756, "grad/layer_24/attn": 0.0032398237381130457, "grad/layer_24/mlp": 0.005805470049381256, "grad/layer_24/attn_mlp_ratio": 0.5580639732439835, "grad/layer_27/attn": 0.0035485655535012484, "grad/layer_27/mlp": 0.005242928862571716, "grad/layer_27/attn_mlp_ratio": 0.6768288448754104} {"step": 17250, "timestamp": 1778344355.0987148, "eos/sharpness": 3.558349609374999, "eos/L0_probe": 2.255789279937744, "eos/L_plus": 2.2696056365966797, "eos/L_minus": 2.2775564193725586, "eos/grad_norm": 0.06604783982038498, "eos/embed_grad_frac": 0.37682873010635376, "eos/time_s": 0.6006515026092529} {"step": 17250, "timestamp": 1778344355.1176975, "train/loss": 2.313147687911987, "train/z_loss": 0.0013873742311261595, "train/perplexity": 10.106185760520415, "train/grad_norm": 0.06591796875, "optim/muon_lr": 0.0001239722967147827, "optim/adamw_lr": 3.719168901443481e-06, "perf/tokens_per_sec": 1911789.0022366564, "perf/iters_per_sec": 0.911612034910515, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0969578742980957, "data/tokens_consumed": 36177969152, "data/tokens_consumed_B": 36.177969152, "train/loss_slope": -1.8436926204522415e-05} {"step": 17250, "timestamp": 1778344356.486373, "geo/rankme_last": 431.0931701660156, "geo/layer_0/stable_rank_q_proj": 20.46041488647461, "geo/layer_0/stable_rank_k_proj": 16.59011459350586, "geo/layer_0/stable_rank_o_proj": 43.63741683959961, "geo/layer_0/stable_rank_gate_proj": 122.99535369873047, "geo/layer_0/stable_rank_down_proj": 57.991539001464844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06777232140302658, "geo/layer_0/attn_entropy_mean": 6.221223831176758, "geo/layer_0/attn_entropy_std": 0.4661073386669159, "geo/layer_7/stable_rank_q_proj": 41.59152603149414, "geo/layer_7/stable_rank_k_proj": 38.80986785888672, "geo/layer_7/stable_rank_o_proj": 86.7701416015625, "geo/layer_7/stable_rank_gate_proj": 76.83543395996094, "geo/layer_7/stable_rank_down_proj": 144.6666259765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.40274253487586975, "geo/layer_7/attn_entropy_mean": 4.74475622177124, "geo/layer_7/attn_entropy_std": 0.7406396269798279, "geo/layer_14/stable_rank_q_proj": 51.517005920410156, "geo/layer_14/stable_rank_k_proj": 44.545230865478516, "geo/layer_14/stable_rank_o_proj": 42.11007308959961, "geo/layer_14/stable_rank_gate_proj": 71.66900634765625, "geo/layer_14/stable_rank_down_proj": 126.47346496582031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3801402151584625, "geo/layer_14/attn_entropy_mean": 5.499035835266113, "geo/layer_14/attn_entropy_std": 0.47031426429748535, "geo/layer_21/stable_rank_q_proj": 38.22847366333008, "geo/layer_21/stable_rank_k_proj": 28.443134307861328, "geo/layer_21/stable_rank_o_proj": 64.04420471191406, "geo/layer_21/stable_rank_gate_proj": 59.12894821166992, "geo/layer_21/stable_rank_down_proj": 48.71515655517578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13805238902568817, "geo/layer_21/attn_entropy_mean": 5.8481340408325195, "geo/layer_21/attn_entropy_std": 0.33333829045295715, "geo/layer_27/stable_rank_q_proj": 45.467071533203125, "geo/layer_27/stable_rank_k_proj": 30.418184280395508, "geo/layer_27/stable_rank_o_proj": 106.48726654052734, "geo/layer_27/stable_rank_gate_proj": 68.29486846923828, "geo/layer_27/stable_rank_down_proj": 129.31344604492188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08960580825805664, "geo/layer_27/attn_entropy_mean": 4.279221057891846, "geo/layer_27/attn_entropy_std": 0.7139060497283936, "attnres/final_alpha/block_0": 0.26466575264930725, "attnres/block_norm/0": 1.78524649143219, "attnres/final_alpha/block_1": 0.0036884688306599855, "attnres/block_norm/1": 50525.453125, "attnres/final_alpha/block_2": 0.007671086583286524, "attnres/block_norm/2": 30422.23046875, "attnres/final_alpha/block_3": 0.009752923622727394, "attnres/block_norm/3": 75645.546875, "attnres/final_alpha/block_4": 0.011169358156621456, "attnres/block_norm/4": 17754.6796875, "attnres/final_alpha/block_5": 0.6043951511383057, "attnres/block_norm/5": 7234.72802734375, "attnres/final_alpha/block_6": 0.09865722060203552, "attnres/block_norm/6": 49468.5546875, "geo/tier1_time_s": 1.364508867263794, "geo/step": 17250.0, "geo/rankme_slope": 0.00039082369275835334} {"step": 17260, "timestamp": 1778344366.844536, "train/loss": 2.2302972316741942, "train/z_loss": 0.001392236235551536, "train/perplexity": 9.302630705094968, "train/grad_norm": 0.0673828125, "optim/muon_lr": 9.715497493743897e-05, "optim/adamw_lr": 2.9146492481231687e-06, "perf/tokens_per_sec": 1788938.4172614054, "perf/iters_per_sec": 0.8530323110873248, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.1722885370254517, "data/tokens_consumed": 36198940672, "data/tokens_consumed_B": 36.198940672, "train/loss_slope": -1.8978513900202998e-05} {"step": 17270, "timestamp": 1778344377.1993418, "train/loss": 2.328557515144348, "train/z_loss": 0.0013919353135861457, "train/perplexity": 10.263126445873086, "train/grad_norm": 0.068359375, "optim/muon_lr": 7.358789443969727e-05, "optim/adamw_lr": 2.2076368331909178e-06, "perf/tokens_per_sec": 2026504.776130035, "perf/iters_per_sec": 0.9663127785349059, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0348616123199463, "data/tokens_consumed": 36219912192, "data/tokens_consumed_B": 36.219912192, "train/loss_slope": -1.4307159380336339e-05} {"step": 17280, "timestamp": 1778344387.5624673, "train/loss": 2.2885014533996584, "train/z_loss": 0.001396177348215133, "train/perplexity": 9.860150709133002, "train/grad_norm": 0.0849609375, "optim/muon_lr": 5.327880382537842e-05, "optim/adamw_lr": 1.5983641147613525e-06, "perf/tokens_per_sec": 2024575.3649319496, "perf/iters_per_sec": 0.9653927635822056, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0358478307724, "data/tokens_consumed": 36240883712, "data/tokens_consumed_B": 36.240883712, "train/loss_slope": -1.4111041293071351e-05} {"step": 17290, "timestamp": 1778344397.922803, "train/loss": 2.2527175426483153, "train/z_loss": 0.0014035382657311857, "train/perplexity": 9.51355422853155, "train/grad_norm": 0.06787109375, "optim/muon_lr": 3.6235451698303226e-05, "optim/adamw_lr": 1.0870635509490967e-06, "perf/tokens_per_sec": 2025456.6588999652, "perf/iters_per_sec": 0.9658129972934557, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0353971242904663, "data/tokens_consumed": 36261855232, "data/tokens_consumed_B": 36.261855232, "train/loss_slope": -1.4792397756888515e-05} {"step": 17300, "timestamp": 1778344408.2734535, "grad/layer_0/attn": 0.0021947831846773624, "grad/layer_0/mlp": 0.0025718642864376307, "grad/layer_0/attn_mlp_ratio": 0.8533821597480981, "grad/layer_4/attn": 0.001461389590986073, "grad/layer_4/mlp": 0.0023947374429553747, "grad/layer_4/attn_mlp_ratio": 0.610250419836219, "grad/layer_8/attn": 0.0027783496771007776, "grad/layer_8/mlp": 0.003092379542067647, "grad/layer_8/attn_mlp_ratio": 0.898450384068335, "grad/layer_12/attn": 0.006667733658105135, "grad/layer_12/mlp": 0.005674186162650585, "grad/layer_12/attn_mlp_ratio": 1.1750995384121272, "grad/layer_16/attn": 0.002443663775920868, "grad/layer_16/mlp": 0.0035790663678199053, "grad/layer_16/attn_mlp_ratio": 0.6827656870562009, "grad/layer_20/attn": 0.0033221591729670763, "grad/layer_20/mlp": 0.004122899379581213, "grad/layer_20/attn_mlp_ratio": 0.8057822387909704, "grad/layer_24/attn": 0.003428940661251545, "grad/layer_24/mlp": 0.0055342125706374645, "grad/layer_24/attn_mlp_ratio": 0.619589608372708, "grad/layer_27/attn": 0.0033316754270344973, "grad/layer_27/mlp": 0.005392055958509445, "grad/layer_27/attn_mlp_ratio": 0.6178859030548526} {"step": 17300, "timestamp": 1778344408.2874076, "train/loss": 2.273689341545105, "train/z_loss": 0.0013947498868219555, "train/perplexity": 9.715177385042448, "train/grad_norm": 0.07421875, "optim/muon_lr": 2.246201038360596e-05, "optim/adamw_lr": 6.738603115081786e-07, "perf/tokens_per_sec": 2025011.1142393013, "perf/iters_per_sec": 0.9656005450436121, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0356249332427978, "data/tokens_consumed": 36282826752, "data/tokens_consumed_B": 36.282826752, "train/loss_slope": -1.3551111003854059e-05} {"step": 17310, "timestamp": 1778344418.6537724, "train/loss": 2.290013003349304, "train/z_loss": 0.0013962003169581294, "train/perplexity": 9.875066089272073, "train/grad_norm": 0.0712890625, "optim/muon_lr": 1.1963248252868653e-05, "optim/adamw_lr": 3.588974475860595e-07, "perf/tokens_per_sec": 2023920.9515383511, "perf/iters_per_sec": 0.965080714959312, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0361827611923218, "data/tokens_consumed": 36303798272, "data/tokens_consumed_B": 36.303798272, "train/loss_slope": -1.3538593649327785e-05} {"step": 17320, "timestamp": 1778344429.0033646, "train/loss": 2.293636155128479, "train/z_loss": 0.0013981608906760811, "train/perplexity": 9.91090984701946, "train/grad_norm": 0.0703125, "optim/muon_lr": 4.74393367767334e-06, "optim/adamw_lr": 1.4231801033020018e-07, "perf/tokens_per_sec": 2027650.8210614999, "perf/iters_per_sec": 0.9668592553431987, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0342767000198365, "data/tokens_consumed": 36324769792, "data/tokens_consumed_B": 36.324769792, "train/loss_slope": -1.4033419399908194e-05} {"step": 17325, "timestamp": 1778344434.7531457, "eos/sharpness": 2.1291732788085933, "eos/L0_probe": 2.255098819732666, "eos/L_plus": 2.266777753829956, "eos/L_minus": 2.264711618423462, "eos/grad_norm": 0.07040704786777496, "eos/embed_grad_frac": 0.381563276052475, "eos/time_s": 0.5873198509216309} {"step": 17325, "timestamp": 1778344436.132002, "geo/rankme_last": 431.12603759765625, "geo/layer_0/stable_rank_q_proj": 20.46058464050293, "geo/layer_0/stable_rank_k_proj": 16.590166091918945, "geo/layer_0/stable_rank_o_proj": 43.638336181640625, "geo/layer_0/stable_rank_gate_proj": 122.99588775634766, "geo/layer_0/stable_rank_down_proj": 57.99134063720703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06611216813325882, "geo/layer_0/attn_entropy_mean": 6.221095561981201, "geo/layer_0/attn_entropy_std": 0.4666074216365814, "geo/layer_7/stable_rank_q_proj": 41.592491149902344, "geo/layer_7/stable_rank_k_proj": 38.8099479675293, "geo/layer_7/stable_rank_o_proj": 86.76081848144531, "geo/layer_7/stable_rank_gate_proj": 76.83895874023438, "geo/layer_7/stable_rank_down_proj": 144.66598510742188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.3939518332481384, "geo/layer_7/attn_entropy_mean": 4.743325710296631, "geo/layer_7/attn_entropy_std": 0.7405821681022644, "geo/layer_14/stable_rank_q_proj": 51.51805114746094, "geo/layer_14/stable_rank_k_proj": 44.543739318847656, "geo/layer_14/stable_rank_o_proj": 42.110252380371094, "geo/layer_14/stable_rank_gate_proj": 71.66548156738281, "geo/layer_14/stable_rank_down_proj": 126.4752426147461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3837595582008362, "geo/layer_14/attn_entropy_mean": 5.5013909339904785, "geo/layer_14/attn_entropy_std": 0.4690777361392975, "geo/layer_21/stable_rank_q_proj": 38.228172302246094, "geo/layer_21/stable_rank_k_proj": 28.443443298339844, "geo/layer_21/stable_rank_o_proj": 64.04421997070312, "geo/layer_21/stable_rank_gate_proj": 59.131065368652344, "geo/layer_21/stable_rank_down_proj": 48.718719482421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1341758519411087, "geo/layer_21/attn_entropy_mean": 5.846819877624512, "geo/layer_21/attn_entropy_std": 0.3328602910041809, "geo/layer_27/stable_rank_q_proj": 45.467472076416016, "geo/layer_27/stable_rank_k_proj": 30.417682647705078, "geo/layer_27/stable_rank_o_proj": 106.4880599975586, "geo/layer_27/stable_rank_gate_proj": 68.29488372802734, "geo/layer_27/stable_rank_down_proj": 129.311279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09188564866781235, "geo/layer_27/attn_entropy_mean": 4.277439594268799, "geo/layer_27/attn_entropy_std": 0.7137026190757751, "attnres/final_alpha/block_0": 0.26457974314689636, "attnres/block_norm/0": 1.7852104902267456, "attnres/final_alpha/block_1": 0.0036963310558348894, "attnres/block_norm/1": 50531.64453125, "attnres/final_alpha/block_2": 0.007709886878728867, "attnres/block_norm/2": 30400.3359375, "attnres/final_alpha/block_3": 0.009814562276005745, "attnres/block_norm/3": 75645.53125, "attnres/final_alpha/block_4": 0.01122736744582653, "attnres/block_norm/4": 17738.974609375, "attnres/final_alpha/block_5": 0.6046450138092041, "attnres/block_norm/5": 7234.30615234375, "attnres/final_alpha/block_6": 0.09832708537578583, "attnres/block_norm/6": 49452.546875, "geo/tier1_time_s": 1.3613648414611816, "geo/step": 17325.0, "geo/rankme_slope": 0.0004206758093862545} {"step": 17330, "timestamp": 1778344441.3046796, "train/loss": 2.3083338022232054, "train/z_loss": 0.0013861453393474222, "train/perplexity": 10.057652647671816, "train/grad_norm": 0.0654296875, "optim/muon_lr": 8.046627044677734e-07, "optim/adamw_lr": 2.41398811340332e-08, "perf/tokens_per_sec": 1705718.1078170377, "perf/iters_per_sec": 0.8133497752270878, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.2294833421707154, "data/tokens_consumed": 36345741312, "data/tokens_consumed_B": 36.345741312, "train/loss_slope": -1.0919412406328585e-05} {"step": 17336, "timestamp": 1778344447.5157073, "train/loss": 2.3416449626286826, "train/z_loss": 0.0013831357937306166, "train/perplexity": 10.398327361926201, "train/grad_norm": 0.0751953125, "optim/muon_lr": 1.6689300537109377e-08, "optim/adamw_lr": 5.006790161132812e-10, "perf/tokens_per_sec": 2027390.7727977252, "perf/iters_per_sec": 0.9667352546681047, "perf/gpu_mem_gb": 77.838755328, "perf/step_time_s": 1.0344093640645344, "data/tokens_consumed": 36358324224, "data/tokens_consumed_B": 36.358324224, "train/loss_slope": -2.6414954989244746e-06}